File: run_benchmark.py

from math import ceil, sqrt
import time

import pandas as pd
import matplotlib.pyplot as plt


def run_benchmark(short_dataset_path, medium_dataset_path, long_dataset_path,
                  data_size, sim_measure, tokenizer = None, num_repeat = 1, 
                  random_seed = 0, output_file = None, encoding = 'latin-1'):
    """Run benchmark for 9 configurations (short-short, short-medium, 
    short-long, medium-short, medium-medium, medium-long, long-short, 
    long-medium, long-long) for the provided similarity measure.

    Specifically, this method takes in three files as input, each containing
    one column of strings. Next, it samples the input files based on the
    provided data_size, and then runs the benchmark for each configuration of
    the provided similarity measure. Finally, it returns a dataframe containing
    the benchmark results.

    Args:                                                                   
        short_dataset_path (string): Path to the dataset containing short strings.
        medium_dataset_path (string): Path to the dataset containing medium strings.
        long_dataset_path (string): Path to the dataset containing long strings.
        data_size (int): Number of string pairs to be benchmarked.
        sim_measure (function): Similarity function to be benchmarked.
        tokenizer (function): Tokenizer to be used (in case of token-based similarity measures). Defaults to None.
        num_repeat (int): Number of times to run each configuration. Defaults to 1.
        random_seed (int): Random seed to be used for sampling. Defaults to 0.
        output_file (string): Output path to save the benchmark results. Defaults to None.         
        encoding (string): Encoding of the input datasets. Defaults to latin-1.

    Returns:                                                                
        Benchmark results (Dataframe).                                   
                                                                                
    Examples:
        >>> jac = Jaccard()
        >>> ws = WhitespaceTokenizer(return_set=True)
        >>> # Benchmark results will be saved in result.csv
        >>> results = run_benchmark('datasets/short_strings.csv', 'datasets/medium_strings.csv',
        ...                         'datasets/long_strings.csv', 100000, jac.get_sim_score,
        ...                         ws.tokenize, output_file='result.csv')
        >>> ed = Levenshtein()
        >>> results = run_benchmark('datasets/short_strings.csv', 'datasets/medium_strings.csv',
        ...                         'datasets/long_strings.csv', 100000, ed.get_sim_score)
    """
  
    # read data
    short_strings = pd.read_csv(short_dataset_path, encoding = encoding)
    medium_strings = pd.read_csv(medium_dataset_path, encoding = encoding)                                  
    long_strings = pd.read_csv(long_dataset_path, encoding = encoding)                                  

    # compute individual table size
    table_size = ceil(sqrt(data_size))

    # sample strings    
    short_table = list(short_strings.sample(table_size, replace = True, 
                                            random_state = random_seed).values)
    medium_table = list(medium_strings.sample(table_size, replace = True, 
                                              random_state = random_seed).values)
    long_table = list(long_strings.sample(table_size, replace = True, 
                                          random_state = random_seed).values)
    
    tables = [('short', short_table), ('medium', medium_table), 
              ('long', long_table)]

    # run benchmark for each configuration
    bench_output = []
    for i in range(len(tables)):
        for j in range(len(tables)):
            runtimes = profile_runtime(tables[i][1], tables[j][1], tokenizer, 
                                       sim_measure, num_repeat)
            runtimes.append(sum(runtimes)/float(num_repeat))
            runtimes.insert(0, '_'.join([tables[i][0], tables[j][0]]))
            bench_output.append(runtimes)

    header = ['run_'+str(i+1)+' (in secs)' for i in range(num_repeat)]
    header.append('average (in secs)')
    header.insert(0, 'configuration')
    output_table = pd.DataFrame(bench_output, columns = header)

    if output_file:
        output_table.to_csv(output_file, index = False)

    return output_table

 
def profile_runtime(table_A, table_B, tokenizer, sim_measure, num_repeat):
    """Run the benchmark for a single configuration, repeating it num_repeat times."""
    runtimes = []
    for i in range(num_repeat):
        start_time = time.time()
        for string1 in table_A:
            for string2 in table_B:
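                # each sampled row is a one-element array, hence the [0] indexing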
                if tokenizer:
                    score = sim_measure(tokenizer(string1[0]), tokenizer(string2[0]))
                else:
                    score = sim_measure(string1[0], string2[0])
        end_time = time.time()
        runtimes.append(end_time-start_time)
    return runtimes


def plot_benchmark(bench_output, output_file, 
                   conf_attr = 'configuration', time_attr = 'average (in secs)'):
    """Plot the average runtime of each configuration and save the figure to output_file."""
    x_range = list(range(len(bench_output)))
    plt.xticks(x_range, list(bench_output[conf_attr]))
    plt.plot(x_range, bench_output[time_attr], marker='o')
    plt.xlabel('Configuration')
    plt.ylabel('Average time (in secs)')
    plt.title('Benchmark plot')
    plt.savefig(output_file)
    print('Plot generated successfully.')
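

# ---------------------------------------------------------------------------
# Illustrative usage: a minimal sketch, not part of the original module.
# The dataset paths match the hypothetical ones in the docstring above, and
# 'jaccard_benchmark.csv' / 'jaccard_benchmark.png' are placeholder output
# names. Jaccard and WhitespaceTokenizer are py_stringmatching's standard
# similarity measure and tokenizer classes.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from py_stringmatching import Jaccard, WhitespaceTokenizer

    jac = Jaccard()
    ws = WhitespaceTokenizer(return_set=True)

    # benchmark Jaccard over whitespace tokens on 10,000 sampled string pairs
    results = run_benchmark('datasets/short_strings.csv',
                            'datasets/medium_strings.csv',
                            'datasets/long_strings.csv',
                            10000, jac.get_sim_score, tokenizer=ws.tokenize,
                            num_repeat=3, output_file='jaccard_benchmark.csv')

    # plot the average runtime of each of the 9 configurations
    plot_benchmark(results, 'jaccard_benchmark.png')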