Changes to /src/lepl/lexer/rewriters.py
9321cf753f02 vs. 67579ff784f3
Revision 67579ff784f3
In the listing below, unchanged lines are shown once; in the single changed block, lines prefixed with - are from 9321cf753f02 and lines prefixed with + are from 67579ff784f3.
# Copyright 2009 Andrew Cooke

# This file is part of LEPL.
#
# LEPL is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# LEPL is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with LEPL. If not, see <http://www.gnu.org/licenses/>.

'''
Rewrite a matcher graph to include lexing.
'''

from collections import deque
from logging import getLogger

from lepl.lexer.matchers import BaseToken, Lexer, LexerError, NonToken
from lepl.operators import Matcher
from lepl.regexp.unicode import UnicodeAlphabet


def find_tokens(matcher):
    '''
    Returns a set of Tokens. Also asserts that children of tokens are
    not themselves Tokens.

    Should we also check that a Token occurs somewhere on every path to a
    leaf node?
    '''
    (tokens, visited, non_tokens) = (set(), set(), set())
    stack = deque([matcher])
    while stack:
        matcher = stack.popleft()
        if isinstance(matcher, NonToken):
            non_tokens.add(matcher)
        if matcher not in visited:
            visited.add(matcher)
            if isinstance(matcher, BaseToken):
                tokens.add(matcher)
                if matcher.content:
                    assert_not_token(matcher.content, visited)
            else:
                for child in matcher:
                    if isinstance(child, Matcher):
                        stack.append(child)
    if tokens and non_tokens:
        raise LexerError('The grammar contains a mix of Tokens and non-Token '
                         'matchers at the top level. If Tokens are used then '
                         'non-token matchers that consume input must only '
                         'appear "inside" Tokens. The non-Token matchers '
                         'include: {0}.'
                         .format('; '.join(n.__class__.__name__
                                           for n in non_tokens)))
    return tokens

-def unique_ids(tokens):
-    '''
-    Drop tokens with unique IDs (these are "machine added" things like
-    indentation).
-    '''
-    log = getLogger('lepl.lexer.rewriters.unique_ids')
-    by_id = {}
-    for token in tokens:
-        if token.id_ in by_id:
-            if type(token.id_) is int:
-                log.warn('Discarding duplicate token: {0}'.format(token))
-        else:
-            by_id[token.id_] = token
-    return set(by_id.values())
+#def unique_ids(tokens):
+#    '''
+#    Drop tokens with unique IDs (these are "machine added" things like
+#    indentation).
+#    '''
+#    log = getLogger('lepl.lexer.rewriters.unique_ids')
+#    by_id = {}
+#    for token in tokens:
+#        if token.id_ in by_id:
+#            if type(token.id_) is int:
+#                log.warn('Discarding duplicate token: {0}'.format(token))
+#        else:
+#            by_id[token.id_] = token
+#    return set(by_id.values())

def assert_not_token(node, visited):
    '''
    Assert that neither this nor any child node is a Token.
    '''
    if isinstance(node, Matcher) and node not in visited:
        visited.add(node)
        if isinstance(node, BaseToken):
            raise LexerError('Nested token: {0}'.format(node))
        else:
            for child in node:
                assert_not_token(child, visited)


def lexer_rewriter(alphabet=None, discard='[ \t\r\n]', extra_tokens=None,
                   source=None):
    '''
    This is required when using Tokens. It does the following:
    - Find all tokens in the matcher graph
    - Construct a lexer from the tokens
    - Connect the lexer to the matcher
    - Check that all children have a token parent
      (and optionally add a default token)
    Although possibly not in that order.

    alphabet is the alphabet for which the regular expressions are defined.

    discard is a regular expression that is used to match space (typically)
    if no token can be matched (and which is then discarded)

    error is raised if no token or discard can be matched (it is passed the
    current stream).

    extra_tokens are added to those found by analysing the grammar.

    source is the source used to generate the final stream.
    '''

    log = getLogger('lepl.lexer.rewriters.lexer_rewriter')

    if alphabet is None:
        alphabet = UnicodeAlphabet.instance()
    def rewriter(matcher):
        '''
        Either construct the lexer, or warn that none found.
        '''
        tokens = find_tokens(matcher)
        if extra_tokens:
            tokens.update(extra_tokens)
        # this breaks specialised tokens
        # tokens = unique_ids(tokens)
        if tokens:
            return Lexer(matcher, tokens, alphabet, discard, source=source)
        else:
            log.info('Lexer rewriter used, but no tokens found.')
            return matcher
    return rewriter
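
For orientation, a minimal usage sketch of the rewriter defined above; it is not part of either revision. Token is assumed to be the regexp-based BaseToken subclass exported by lepl.lexer.matchers, and the single-token grammar is a stand-in; only lexer_rewriter and its defaults come from the file shown here.

# Hedged sketch, not part of the revisions above: `Token` is assumed to be the
# concrete BaseToken subclass in lepl.lexer.matchers; `grammar` is a stand-in.
from lepl.lexer.matchers import Token
from lepl.lexer.rewriters import lexer_rewriter

grammar = Token('[0-9]+')      # assumed grammar containing a single Token matcher
rewrite = lexer_rewriter()     # defaults: UnicodeAlphabet instance, discard '[ \t\r\n]'
lexed = rewrite(grammar)       # find_tokens finds the Token, so a Lexer wrapping
                               # `grammar` is returned; with no Tokens the original
                               # matcher would be returned unchanged

The two-step shape (build the rewriter, then apply it to the top-level matcher) mirrors how the file separates lexer_rewriter's arguments from the inner rewriter(matcher) closure.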