My favorites | Sign in
Project Home Downloads Wiki Issues Source
Checkout   Browse   Changes    
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python

"""Benchmarks for Python's regex engine.

These are some of the original benchmarks used to tune Python's regex engine
in 2000 written by Fredrik Lundh. Retreived from
http://mail.python.org/pipermail/python-dev/2000-August/007797.html and
integrated into Unladen Swallow's perf.py in 2009 by David Laing.

These benchmarks are of interest since they helped to guide the original
optimization of the sre engine, and we shouldn't necessarily ignore them just
because they're "old".
"""

# Python imports
import optparse
import re
import time

# Local imports
import util

# These are the regular expressions to be tested. These sync up,
# index-for-index with the list of strings generated by gen_string_table()
# below.
regexs = [
re.compile('Python|Perl'),
re.compile('Python|Perl'),
re.compile('(Python|Perl)'),
re.compile('(?:Python|Perl)'),
re.compile('Python'),
re.compile('Python'),
re.compile('.*Python'),
re.compile('.*Python.*'),
re.compile('.*(Python)'),
re.compile('.*(?:Python)'),
re.compile('Python|Perl|Tcl'),
re.compile('Python|Perl|Tcl'),
re.compile('(Python|Perl|Tcl)'),
re.compile('(?:Python|Perl|Tcl)'),
re.compile('(Python)\\1'),
re.compile('(Python)\\1'),
re.compile('([0a-z][a-z0-9]*,)+'),
re.compile('(?:[0a-z][a-z0-9]*,)+'),
re.compile('([a-z][a-z0-9]*,)+'),
re.compile('(?:[a-z][a-z0-9]*,)+'),
re.compile('.*P.*y.*t.*h.*o.*n.*')]


def gen_string_table(n):
"""Generates the list of strings that will be used in the benchmarks.

All strings have repeated prefixes and suffices, and n specifies the
number of repetitions.
"""
strings = []
strings.append('-'*n+'Perl'+'-'*n)
strings.append('P'*n+'Perl'+'P'*n)
strings.append('-'*n+'Perl'+'-'*n)
strings.append('-'*n+'Perl'+'-'*n)
strings.append('-'*n+'Python'+'-'*n)
strings.append('P'*n+'Python'+'P'*n)
strings.append('-'*n+'Python'+'-'*n)
strings.append('-'*n+'Python'+'-'*n)
strings.append('-'*n+'Python'+'-'*n)
strings.append('-'*n+'Python'+'-'*n)
strings.append('-'*n+'Perl'+'-'*n)
strings.append('P'*n+'Perl'+'P'*n)
strings.append('-'*n+'Perl'+'-'*n)
strings.append('-'*n+'Perl'+'-'*n)
strings.append('-'*n+'PythonPython'+'-'*n)
strings.append('P'*n+'PythonPython'+'P'*n)
strings.append('-'*n+'a5,b7,c9,'+'-'*n)
strings.append('-'*n+'a5,b7,c9,'+'-'*n)
strings.append('-'*n+'a5,b7,c9,'+'-'*n)
strings.append('-'*n+'a5,b7,c9,'+'-'*n)
strings.append('-'*n+'Python'+'-'*n)
return strings

# A cache for the generated strings.
string_tables = {}

def init_benchmarks(n_values=None):
"""Initialize the strings we'll run the regexes against.

The strings used in the benchmark are prefixed and suffixed by
strings that are repeated n times.

The sequence n_values contains the values for n.
If n_values is None the values of n from the original benchmark
are used.

The generated list of strings is cached in the string_tables
variable, which is indexed by n.

Returns:
A list of string prefix/suffix lengths.
"""
if n_values is None:
n_values = [0, 5, 50, 250, 1000, 5000, 10000]

for n in n_values:
string_tables[n] = gen_string_table(n)
return n_values


def run_benchmarks(n):
"""Runs all of the benchmarks for a given value of n."""
for id in xrange(len(regexs)):
re.search(regexs[id], string_tables[n][id])
re.search(regexs[id], string_tables[n][id])
re.search(regexs[id], string_tables[n][id])
re.search(regexs[id], string_tables[n][id])
re.search(regexs[id], string_tables[n][id])
re.search(regexs[id], string_tables[n][id])
re.search(regexs[id], string_tables[n][id])
re.search(regexs[id], string_tables[n][id])
re.search(regexs[id], string_tables[n][id])
re.search(regexs[id], string_tables[n][id])


def test_regex_effbot(iterations):
sizes = init_benchmarks()

# Warm up.
for size in sizes:
run_benchmarks(size)

times = []
for i in xrange(iterations):
t0 = time.time()
for size in sizes:
run_benchmarks(size)
t1 = time.time()
times.append(t1 - t0)
return times


if __name__ == '__main__':
parser = optparse.OptionParser(
usage="%prog [options]",
description=("Test the performance of regexps using Fredik Lundh's "
"benchmarks."))
util.add_standard_options_to(parser)
options, args = parser.parse_args()

util.run_benchmark(options, options.num_runs, test_regex_effbot)

Change log

r994 by reid.kleckner on Jan 14, 2010   Diff
Canonicalize the shebang line of our
benchmarking scripts to:
#!/usr/bin/env python
Go to: 
Project members, sign in to write a code review

Older revisions

r899 by collinw on Nov 16, 2009   Diff
Refactor common benchmark code into
util.py; add a --take_geo_mean
aggregation option.
r623 by collinw on Jun 9, 2009   Diff
Incorporate effbot's regex benchmarks
into perf.py.
All revisions of this file

File info

Size: 4659 bytes, 147 lines
Powered by Google Project Hosting