| 1 #
2 # Secret Labs' Regular Expression Engine
3 #
4 # re-compatible interface for the sre matching engine
5 #
6 # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
7 #
8 # This version of the SRE library can be redistributed under CNRI's
9 # Python 1.6 license. For any other use, please contact Secret Labs
10 # AB (info@pythonware.com).
11 #
12 # Portions of this engine have been developed in cooperation with
13 # CNRI. Hewlett-Packard provided funding for 1.6 integration and
14 # other compatibility work.
15 #
16
17 r"""Support for regular expressions (RE).
18
19 This module provides regular expression matching operations similar to
20 those found in Perl. It supports both 8-bit and Unicode strings; both
21 the pattern and the strings being processed can contain null bytes and
22 characters outside the US ASCII range.
23
24 Regular expressions can contain both special and ordinary characters.
25 Most ordinary characters, like "A", "a", or "0", are the simplest
26 regular expressions; they simply match themselves. You can
27 concatenate ordinary characters, so last matches the string 'last'.
28
29 The special characters are:
30 "." Matches any character except a newline.
31 "^" Matches the start of the string.
32 "$" Matches the end of the string or just before the newline at
33 the end of the string.
34 "*" Matches 0 or more (greedy) repetitions of the preceding RE.
35 Greedy means that it will match as many repetitions as possible.
36 "+" Matches 1 or more (greedy) repetitions of the preceding RE.
37 "?" Matches 0 or 1 (greedy) of the preceding RE.
38 *?,+?,?? Non-greedy versions of the previous three special characters.
39 {m,n} Matches from m to n repetitions of the preceding RE.
40 {m,n}? Non-greedy version of the above.
41 "\\" Either escapes special characters or signals a special sequence.
42 [] Indicates a set of characters.
43 A "^" as the first character indicates a complementing set.
44 "|" A|B, creates an RE that will match either A or B.
45 (...) Matches the RE inside the parentheses.
46 The contents can be retrieved or matched later in the string.
47 (?aiLmsux) Set the A, I, L, M, S, U, or X flag for the RE (see below).
48 (?:...) Non-grouping version of regular parentheses.
49 (?P<name>...) The substring matched by the group is accessible by name.
50 (?P=name) Matches the text matched earlier by the group named name.
51 (?#...) A comment; ignored.
52 (?=...) Matches if ... matches next, but doesn't consume the string.
53 (?!...) Matches if ... doesn't match next.
54 (?<=...) Matches if preceded by ... (must be fixed length).
55 (?<!...) Matches if not preceded by ... (must be fixed length).
56 (?(id/name)yes|no) Matches yes pattern if the group with id/name matched,
57 the (optional) no pattern otherwise.
58
59 The special sequences consist of "\\" and a character from the list
60 below. If the ordinary character is not on the list, then the
61 resulting RE will match the second character.
62 \number Matches the contents of the group of the same number.
63 \A Matches only at the start of the string.
64 \Z Matches only at the end of the string.
65 \b Matches the empty string, but only at the start or end of a word.
66 \B Matches the empty string, but not at the start or end of a word.
67 \d Matches any decimal digit; equivalent to the set [0-9] in
68 bytes patterns or string patterns with the ASCII flag.
69 In string patterns without the ASCII flag, it will match the whole
70 range of Unicode digits.
71 \D Matches any non-digit character; equivalent to [^\d].
72 \s Matches any whitespace character; equivalent to [ \t\n\r\f\v] in
73 bytes patterns or string patterns with the ASCII flag.
74 In string patterns without the ASCII flag, it will match the whole
75 range of Unicode whitespace characters.
76 \S Matches any non-whitespace character; equivalent to [^\s].
77 \w Matches any alphanumeric character; equivalent to [a-zA-Z0-9_]
78 in bytes patterns or string patterns with the ASCII flag.
79 In string patterns without the ASCII flag, it will match the
80 range of Unicode alphanumeric characters (letters plus digits
81 plus underscore).
82 With LOCALE, it will match the set [0-9_] plus characters defined
83 as letters for the current locale.
84 \W Matches the complement of \w.
85 \\ Matches a literal backslash.
86
87 This module exports the following functions:
88 match Match a regular expression pattern to the beginning of a string.
89 search Search a string for the presence of a pattern.
90 sub Substitute occurrences of a pattern found in a string.
91 subn Same as sub, but also return the number of substitutions made.
92 split Split a string by the occurrences of a pattern.
93 findall Find all occurrences of a pattern in a string.
94 finditer Return an iterator yielding a match object for each match.
95 compile Compile a pattern into a RegexObject.
96 purge Clear the regular expression cache.
97 escape Backslash all non-alphanumerics in a string.
98
Some of the functions in this module take flags as optional parameters:
100 A ASCII For string patterns, make \w, \W, \b, \B, \d, \D
101 match the corresponding ASCII character categories
102 (rather than the whole Unicode categories, which is the
103 default).
104 For bytes patterns, this flag is the only available
105 behaviour and needn't be specified.
106 I IGNORECASE Perform case-insensitive matching.
    L  LOCALE      Make \w, \W, \b, \B dependent on the current locale.
108 M MULTILINE "^" matches the beginning of lines (after a newline)
109 as well as the string.
110 "$" matches the end of lines (before a newline) as well
111 as the end of the string.
112 S DOTALL "." matches any character at all, including the newline.
113 X VERBOSE Ignore whitespace and comments for nicer looking RE's.
114 U UNICODE For compatibility only. Ignored for string patterns (it
115 is the default), and forbidden for bytes patterns.
116
117 This module also defines an exception 'error'.
118
119 """
120
121 import sys
122 import sre_compile
123 import sre_parse
124
125 # public symbols
126 __all__ = [ "match", "search", "sub", "subn", "split", "findall",
127 "compile", "purge", "template", "escape", "A", "I", "L", "M", "S", "X",
128 "U", "ASCII", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
129 "UNICODE", "error" ]
130
131 __version__ = "2.2.1"
132
# Flag constants.  Each flag is taken from sre_compile under its long name
# and then given a single-letter alias bound to the same value.
ASCII = sre_compile.SRE_FLAG_ASCII            # assume ascii "locale"
A = ASCII
IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE  # ignore case
I = IGNORECASE
LOCALE = sre_compile.SRE_FLAG_LOCALE          # assume current 8-bit locale
L = LOCALE
UNICODE = sre_compile.SRE_FLAG_UNICODE        # assume unicode "locale"
U = UNICODE
MULTILINE = sre_compile.SRE_FLAG_MULTILINE    # make anchors look for newline
M = MULTILINE
DOTALL = sre_compile.SRE_FLAG_DOTALL          # make dot match newline
S = DOTALL
VERBOSE = sre_compile.SRE_FLAG_VERBOSE        # ignore whitespace and comments
X = VERBOSE

# sre extensions (experimental, don't rely on these)
TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE      # disable backtracking
T = TEMPLATE
DEBUG = sre_compile.SRE_FLAG_DEBUG            # dump pattern after compilation

# Exception type re-exported from sre_compile under the name "error".
error = sre_compile.error
148
149 # --------------------------------------------------------------------
150 # public interface
151
def match(pattern, string, flags=0):
    """Apply the pattern at the beginning of string.

    Returns a match object, or None if no match was found.
    """
    compiled = _compile(pattern, flags)
    return compiled.match(string)
156
def search(pattern, string, flags=0):
    """Scan string for a location where the pattern matches.

    Returns a match object, or None if no match was found.
    """
    compiled = _compile(pattern, flags)
    return compiled.search(string)
161
def sub(pattern, repl, string, count=0, flags=0):
    """Replace the leftmost non-overlapping occurrences of pattern in string.

    repl is either a string or a callable.  A string has its backslash
    escapes processed; a callable is passed the match object and must
    return the replacement string to use.  Returns the resulting string.
    """
    compiled = _compile(pattern, flags)
    return compiled.sub(repl, string, count)
170
def subn(pattern, repl, string, count=0, flags=0):
    """Like sub(), but return a 2-tuple (new_string, number).

    new_string is the source string with the leftmost non-overlapping
    occurrences of pattern replaced by repl; number is the number of
    substitutions that were made.  repl is either a string (backslash
    escapes in it are processed) or a callable that is passed the match
    object and must return the replacement string to use.
    """
    compiled = _compile(pattern, flags)
    return compiled.subn(repl, string, count)
181
def split(pattern, string, maxsplit=0, flags=0):
    """Split string by the occurrences of pattern and return the pieces.

    The result is a list of substrings.  When the pattern contains
    capturing parentheses, the text of all groups is included in the
    list as well.  A nonzero maxsplit limits the number of splits; the
    unsplit remainder of the string is then the final list element.
    """
    compiled = _compile(pattern, flags)
    return compiled.split(string, maxsplit)
191
def findall(pattern, string, flags=0):
    """Return a list of all non-overlapping matches of pattern in string.

    With one or more capturing groups in the pattern the list holds the
    groups instead; its items are tuples when the pattern has more than
    one group.

    Empty matches are included in the result.
    """
    compiled = _compile(pattern, flags)
    return compiled.findall(string)
201
# finditer used to be defined only when sys.hexversion >= 0x02020000
# (Python 2.2+).  This module already requires Python 3 (it imports
# copyreg and uses bytes literals), so that guard was always true and the
# function is now defined and exported unconditionally.
__all__.append("finditer")

def finditer(pattern, string, flags=0):
    """Return an iterator over all non-overlapping matches in the string.

    pattern and flags are handed to _compile(); string is the text that
    is scanned.  For each match, the iterator returns a match object.

    Empty matches are included in the result.
    """
    return _compile(pattern, flags).finditer(string)
210
def compile(pattern, flags=0):
    """Compile a regular expression pattern and return the pattern object."""
    compiled = _compile(pattern, flags)
    return compiled
214
def purge():
    """Empty both module-level caches: compiled patterns and replacements."""
    for cache in (_cache, _cache_repl):
        cache.clear()
219
def template(pattern, flags=0):
    """Compile a template pattern and return the pattern object.

    Same as compiling pattern with the T (TEMPLATE) flag OR-ed into flags.
    """
    template_flags = flags | T
    return _compile(pattern, template_flags)
223
# Characters (str) and byte values (bytes) that escape() leaves untouched:
# ASCII letters, digits and the underscore.
_alphanum_str = frozenset(
    "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890")
_alphanum_bytes = frozenset(
    b"_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890")

def escape(pattern):
    """Backslash-escape everything in pattern except ASCII letters,
    digits and '_'.

    A str pattern yields a str.  Any other pattern is iterated as a
    sequence of integer byte values and yields a bytes object.  A NUL
    character (or zero byte) is written as a backslash followed by
    three zero digits rather than as a backslash followed by a raw NUL.
    """
    if isinstance(pattern, str):
        pieces = []
        for ch in pattern:
            if ch in _alphanum_str:
                pieces.append(ch)
            elif ch == "\000":
                pieces.append("\\000")
            else:
                pieces.append("\\" + ch)
        return "".join(pieces)

    # bytes-like input: work on integer byte values
    backslash = ord(b"\\")
    out = []
    for byte in pattern:
        if byte in _alphanum_bytes:
            out.append(byte)
        elif byte == 0:
            out.extend(b"\\000")
        else:
            out.extend((backslash, byte))
    return bytes(out)
257
258 # --------------------------------------------------------------------
259 # internals
260
261 _cache = {}
262 _cache_repl = {}
263
264 _pattern_type = type(sre_compile.compile("", 0))
265
266 _MAXCACHE = 512
267 def _compile(pattern, flags):
268 # internal: compile pattern
269 try:
270 return _cache[type(pattern), pattern, flags]
271 except KeyError:
272 pass
273 if isinstance(pattern, _pattern_type):
274 if flags:
275 raise ValueError(
276 "Cannot process flags argument with a compiled pattern")
277 return pattern
278 if not sre_compile.isstring(pattern):
279 raise TypeError("first argument must be string or compiled pattern")
280 p = sre_compile.compile(pattern, flags)
281 if len(_cache) >= _MAXCACHE:
282 _cache.clear()
283 _cache[type(pattern), pattern, flags] = p
284 return p
285
def _compile_repl(repl, pattern):
    """Internal: return the parsed replacement template for repl.

    Results of sre_parse.parse_template(repl, pattern) are kept in
    _cache_repl under the key (repl, pattern).  Once that cache holds
    _MAXCACHE entries it is emptied before the new entry is stored.
    """
    key = (repl, pattern)
    try:
        return _cache_repl[key]
    except KeyError:
        pass
    parsed = sre_parse.parse_template(repl, pattern)
    if len(_cache_repl) >= _MAXCACHE:
        _cache_repl.clear()
    _cache_repl[key] = parsed
    return parsed
297
298 def _expand(pattern, match, template):
299 # internal: match.expand implementation hook
300 template = sre_parse.parse_template(template, pattern)
301 return sre_parse.expand_template(template, match)
302
def _subx(pattern, template):
    """Internal: helper for pattern.sub/subn.

    Compiles template through the replacement cache.  If the compiled
    form has no group references and exactly one literal, that literal
    is returned directly.  Otherwise a function is returned that expands
    the compiled template for a given match object.
    """
    compiled = _compile_repl(template, pattern)
    groups = compiled[0]
    literals = compiled[1]
    if not groups and len(literals) == 1:
        # literal replacement
        return literals[0]

    def expand_match(match, template=compiled):
        return sre_parse.expand_template(template, match)

    return expand_match
312
313 # register myself for pickling
314
315 import copyreg
316
def _pickle(p):
    """Reduction function for compiled patterns.

    Returns (_compile, (pattern source, flags)), so unpickling rebuilds
    the pattern by calling _compile with those arguments.
    """
    rebuild_args = (p.pattern, p.flags)
    return _compile, rebuild_args

# Register the reduction function for the compiled-pattern type.
copyreg.pickle(_pattern_type, _pickle, _compile)
321
322 # --------------------------------------------------------------------
323 # experimental stuff (see python-dev discussions for details)
324
325 class Scanner:
326 def __init__(self, lexicon, flags=0):
327 from sre_constants import BRANCH, SUBPATTERN
328 self.lexicon = lexicon
329 # combine phrases into a compound pattern
330 p = []
331 s = sre_parse.Pattern()
332 s.flags = flags
333 for phrase, action in lexicon:
334 p.append(sre_parse.SubPattern(s, [
335 (SUBPATTERN, (len(p)+1, sre_parse.parse(phrase, flags))),
336 ]))
337 s.groups = len(p)+1
338 p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
339 self.scanner = sre_compile.compile(p)
340 def scan(self, string):
341 result = []
342 append = result.append
343 match = self.scanner.scanner(string).match
344 i = 0
345 while 1:
346 m = match()
347 if not m:
348 break
349 j = m.end()
350 if i == j:
351 break
352 action = self.lexicon[m.lastindex-1][1]
353 if callable(action):
354 self.match = m
355 action = action(self, m.group())
356 if action is not None:
357 append(action)
358 i = j
359 return result, string[i:]
|