My favorites | Sign in
Project Home Downloads Wiki Issues Source
Repository:
Checkout   Browse   Changes   Clones  
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225

# Copyright 2009 Andrew Cooke

# This file is part of LEPL.
#
# LEPL is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# LEPL is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with LEPL. If not, see <http://www.gnu.org/licenses/>.

'''
Some intermediate classes that support parsers for objects that can be
converted to strings using str().
'''

from lepl.regexp.core import Alphabet, Character, Sequence, Choice, Repeat, \
Option
from lepl.config import Configuration



def make_str_parser(alphabet):
    r'''
    Construct a parser for string based expressions.

    The `alphabet` argument must provide `from_char`, `min`, `max`,
    `invert`, `escape` and `escaped`, all of which are used below. The
    value returned is the result of calling `string_parser` on the
    grammar, with an empty `Configuration`.

    (This docstring is a raw string so that the backslashes in the text
    below are literal characters rather than invalid escape sequences.)

    We need a clear policy on backslashes. To be as backwards compatible as
    possible I am going with:

    0. "Escaping" means prefixing with \.

    1. These characters are special: {, }, [, ], -, \, (, ), *, ?, ., +,
       ^, $, |.

    2. Special characters (ie literal, or unescaped special characters) may
       not have a meaning currently, or may only have a meaning in certain
       contexts.

    3. To use a special character literally, it must be escaped.

    4. If a special character is used without an escape, in a context
       where it doesn't have a meaning, then it is an error.

    5. If a non-special character is escaped, that is also an error.

    This is not the same as the Python convention, but I believe it makes
    automatic escaping of given text easier.
    '''

    # Avoid dependency loops
    from lepl.functions import Drop, Eos, AnyBut
    from lepl.matchers import Any, Lookahead, Literal, Delayed

    # Helpers that turn matched text into (start, end) intervals expressed
    # in the alphabet's own character type.
    dup = lambda x: (alphabet.from_char(x), alphabet.from_char(x))
    tup = lambda x: (alphabet.from_char(x[0]), alphabet.from_char(x[1]))
    # '.' covers the whole alphabet; the matched text itself is ignored
    dot = lambda x: (alphabet.min, alphabet.max)
    # Character needed here to ensure intervals passed to invert are ordered
    invert = lambda x: alphabet.invert(Character(x, alphabet))

    # Helpers that wrap results in regexp nodes bound to this alphabet.
    sequence = lambda x: Sequence(x, alphabet)
    repeat = lambda x: Repeat(x, alphabet)
    # "one or more" is built as one copy followed by a Repeat of the same
    repeat2 = lambda x: sequence([sequence(x), Repeat(x, alphabet)])
    option = lambda x: Option(x, alphabet)
    choice = lambda x: Choice(x, alphabet)
    character = lambda x: Character(x, alphabet)

    # these two definitions enforce the conditions above, providing only
    # special characters appear as literals in the grammar
    escaped = Drop(alphabet.escape) + Any(alphabet.escaped)
    # NOTE(review): presumably ~Lookahead is a negative lookahead, so a raw
    # character is one that is neither the escape nor special - confirm
    # against the LEPL operator documentation.
    raw = ~Lookahead(alphabet.escape) + AnyBut(alphabet.escaped)

    single = escaped | raw

    # Character-level matchers: '.', a single letter, or an 'a-b' pair
    any_ = Literal('.') >> dot
    letter = single >> dup
    pair = single & Drop('-') & single > tup

    # Bracketed character classes, optionally inverted with a leading '^'
    interval = pair | letter
    brackets = Drop('[') & interval[1:] & Drop(']')
    inverted = Drop('[^') & interval[1:] & Drop(']') >= invert
    char = inverted | brackets | letter | any_ > character

    # item is recursive (groups contain sequences that contain items), so
    # it is declared here and defined once the alternatives exist
    item = Delayed()

    seq = (char | item)[0:] > sequence
    group = Drop('(') & seq & Drop(')')
    alts = Drop('(') & seq[2:, Drop('|')] & Drop(')') > choice
    star = (alts | group | char) & Drop('*') > repeat
    plus = (alts | group | char) & Drop('+') > repeat2
    opt = (alts | group | char) & Drop('?') > option

    item += alts | group | star | plus | opt

    # The Eos() requires the whole input to be consumed
    expr = (char | item)[:] & Drop(Eos())

    # Empty config here avoids loops if the default config includes
    # references to alphabets
    return expr.string_parser(config=Configuration())


class StrAlphabet(Alphabet):
    '''
    An alphabet for unicode strings.

    Besides the range passed to the base class it holds the escape
    character, the set of characters that need escaping, and a parser
    (built by `parser_factory`) used by `parse`.
    '''

    # pylint: disable-msg=E1002
    # (pylint bug? this chains back to a new style abc)
    def __init__(self, min_, max_, escape='\\', escaped='{}[]*()-?.+\\^$|',
                 parser_factory=make_str_parser):
        '''
        Store the escape settings and build the parser.

        `min_` and `max_` are handed to the base class; `parser_factory`
        is called with this alphabet as its only argument.
        '''
        super(StrAlphabet, self).__init__(min_, max_)
        self.__escaped = escaped
        self.__escape = escape
        # built last, since the factory is given this alphabet
        self._parser = parser_factory(self)

    @property
    def escape(self):
        '''
        The escape prefix given to the constructor.
        '''
        return self.__escape

    @property
    def escaped(self):
        '''
        The characters that need escaping, as given to the constructor.
        '''
        return self.__escaped

    def _escape_char(self, char):
        '''
        Return str(char), prefixed with the escape if one is defined and
        the character is listed in `escaped`.
        '''
        text = str(char)
        if self.escape is None or text not in self.escaped:
            return text
        return self.escape + text

    def fmt_intervals(self, intervals):
        '''
        Generate a string representation of a set of intervals.

        This must fully describe the data in the intervals (it is used to
        hash the data).
        '''
        # A lone interval may collapse to a single character or to '.'
        if len(intervals) == 1:
            only = intervals[0]
            if only[0] == only[1]:
                return self._escape_char(only[0])
            if only[0] == self.min and only[1] == self.max:
                return '.'

        # Several intervals starting at the alphabet minimum are written
        # in inverted form, with a leading '^'
        negated = len(intervals) > 1 and intervals[0][0] == self.min
        if negated:
            intervals = self.invert(intervals)
        hat = '^' if negated else ''

        def render(low, high):
            '''Format one interval as 'x' or 'x-y'.'''
            if low == high:
                return self._escape_char(low)
            return '{0!s}-{1!s}'.format(
                self._escape_char(low), self._escape_char(high))

        ranges = [render(low, high) for (low, high) in intervals]
        return '[{0}{1}]'.format(hat, self.join(ranges))

    def fmt_sequence(self, children):
        '''
        Generate a string representation of a sequence by joining the
        str() of each child.

        This must fully describe the data in the children (it is used to
        hash the data).
        '''
        rendered = (str(child) for child in children)
        return self.join(rendered)

    def fmt_repeat(self, children):
        '''
        Generate a string representation of a repetition ('*' suffix).

        This must fully describe the data in the children (it is used to
        hash the data).
        '''
        string = self.fmt_sequence(children)
        # a single Character or Choice needs no extra parentheses
        bare = len(children) == 1 and type(children[0]) in (Character, Choice)
        if not bare:
            return '({0})*'.format(string)
        return string + '*'

    def fmt_choice(self, children):
        '''
        Generate a string representation of a choice: each child is
        formatted as a sequence and the results are separated by '|'
        inside parentheses.

        This must fully describe the data in the children (it is used to
        hash the data).
        '''
        alternatives = (self.fmt_sequence(child) for child in children)
        return '({0})'.format('|'.join(alternatives))

    def fmt_option(self, children):
        '''
        Generate a string representation of an option ('?' suffix).

        This must fully describe the data in the children (it is used to
        hash the data).
        '''
        string = self.fmt_sequence(children)
        # a single Character or Choice needs no extra parentheses
        bare = len(children) == 1 and type(children[0]) in (Character, Choice)
        if not bare:
            return '({0})?'.format(string)
        return string + '?'

    def join(self, chars):
        '''
        Concatenate the given characters into a single string.
        '''
        separator = ''
        return separator.join(chars)

    def from_char(self, char):
        '''
        Convert a single character; here it is returned unchanged.
        '''
        return char

    def parse(self, regexp):
        '''
        Generate a Sequence from the given text by passing it to the
        parser built in the constructor.
        '''
        parser = self._parser
        return parser(regexp)

Change log

67579ff784f3 by andrew cooke <and...@acooke.org> on Sep 18, 2009   Diff
pylint fixes
Go to: 
Project members, sign in to write a code review

Older revisions

31ec121f30de by andrew cooke <and...@acooke.org> on Aug 22, 2009   Diff
started added support for sol/eol
3dce73864b5d by andrew cooke <and...@acooke.org> on Jul 5, 2009   Diff
splitting functions from matchers;
adding columns
66fa8c6a9f56 by andrew cooke <and...@acooke.org> on Jul 2, 2009   Diff
working through code with pylint
All revisions of this file

File info

Size: 7935 bytes, 225 lines
Powered by Google Project Hosting