#
# Secret Labs' Regular Expression Engine
#
# convert re-style regular expression to sre pattern
#
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#

"""Internal support module for sre"""

# XXX: show string offset and offending character for all errors

import sys

from sre_constants import *
from _sre import MAXREPEAT

# characters that are not plain literals at the top level of a pattern
SPECIAL_CHARS = ".\\[{()*+?^$|"
# characters that introduce a repeat of the previous item
REPEAT_CHARS = "*+?{"

DIGITS = set("0123456789")

OCTDIGITS = set("01234567")
HEXDIGITS = set("0123456789abcdefABCDEF")

WHITESPACE = set(" \t\n\r\v\f")

# two-character escapes that stand for a single literal character
ESCAPES = {
    r"\a": (LITERAL, ord("\a")),
    r"\b": (LITERAL, ord("\b")),
    r"\f": (LITERAL, ord("\f")),
    r"\n": (LITERAL, ord("\n")),
    r"\r": (LITERAL, ord("\r")),
    r"\t": (LITERAL, ord("\t")),
    r"\v": (LITERAL, ord("\v")),
    r"\\": (LITERAL, ord("\\"))
}

# two-character escapes that stand for a position assertion (AT) or a
# character category wrapped in a one-element set (IN)
CATEGORIES = {
    r"\A": (AT, AT_BEGINNING_STRING), # start of string
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    r"\Z": (AT, AT_END_STRING), # end of string
}

# inline flag letters, as used in "(?iLmsx)"
FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "L": SRE_FLAG_LOCALE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    # extensions
    "a": SRE_FLAG_ASCII,
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}

class Pattern:
    # master pattern object. keeps track of global attributes
    #
    # flags     -- flag bits collected so far
    # open      -- ids of groups that have been opened but not yet closed
    # groups    -- id that the next opened group will receive (ids start at 1)
    # groupdict -- maps group name to group id
    def __init__(self):
        self.flags = 0
        self.open = []
        self.groups = 1
        self.groupdict = {}
    def opengroup(self, name=None):
        # allocate the next group id, register its name (if any), mark the
        # group as open and return the id.  raises error if the name is
        # already bound to another group.
        gid = self.groups
        self.groups = gid + 1
        if name is not None:
            ogid = self.groupdict.get(name, None)
            if ogid is not None:
                raise error("redefinition of group name %s as group %d; "
                            "was group %d" % (repr(name), gid, ogid))
            self.groupdict[name] = gid
        self.open.append(gid)
        return gid
    def closegroup(self, gid):
        # mark a previously opened group as closed
        self.open.remove(gid)
    def checkgroup(self, gid):
        # true if gid has been allocated and is no longer open
        return gid < self.groups and gid not in self.open

class SubPattern:
    # a subpattern, in intermediate form
    #
    # pattern -- the shared Pattern state object
    # data    -- list of (opcode, argument) tuples
    # width   -- cached result of getwidth(), or None if not yet computed
    def __init__(self, pattern, data=None):
        self.pattern = pattern
        if data is None:
            data = []
        self.data = data
        self.width = None
    def dump(self, level=0):
        # print an indented, human-readable listing of this subpattern.
        # "nl" tracks whether the output cursor is at the start of a line.
        nl = 1
        seqtypes = (tuple, list)
        for op, av in self.data:
            print(level*" " + op, end=' '); nl = 0
            if op == "in":
                # member sublanguage
                print(); nl = 1
                for op, a in av:
                    print((level+1)*" " + op, a)
            elif op == "branch":
                print(); nl = 1
                i = 0
                for a in av[1]:
                    if i > 0:
                        print(level*" " + "or")
                    a.dump(level+1); nl = 1
                    i = i + 1
            elif isinstance(av, seqtypes):
                for a in av:
                    if isinstance(a, SubPattern):
                        if not nl: print()
                        a.dump(level+1); nl = 1
                    else:
                        print(a, end=' ') ; nl = 0
            else:
                print(av, end=' ') ; nl = 0
            if not nl: print()
    def __repr__(self):
        return repr(self.data)
    def __len__(self):
        return len(self.data)
    def __delitem__(self, index):
        del self.data[index]
    def __getitem__(self, index):
        # slicing yields a new SubPattern sharing the same Pattern state;
        # plain indexing yields the stored (opcode, argument) tuple
        if isinstance(index, slice):
            return SubPattern(self.pattern, self.data[index])
        return self.data[index]
    def __setitem__(self, index, code):
        self.data[index] = code
    def insert(self, index, code):
        self.data.insert(index, code)
    def append(self, code):
        self.data.append(code)
    def getwidth(self):
        # determine the width (min, max) for this subpattern
        # the result is cached in self.width and clamped to
        # (MAXREPEAT - 1, MAXREPEAT)
        if self.width is not None:
            return self.width
        lo = hi = 0
        UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
        REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
        for op, av in self.data:
            if op is BRANCH:
                # narrowest and widest alternative
                i = MAXREPEAT - 1
                j = 0
                for alt in av[1]:
                    l, h = alt.getwidth()
                    i = min(i, l)
                    j = max(j, h)
                lo = lo + i
                hi = hi + j
            elif op is CALL:
                i, j = av.getwidth()
                lo = lo + i
                hi = hi + j
            elif op is SUBPATTERN:
                i, j = av[1].getwidth()
                lo = lo + i
                hi = hi + j
            elif op is GROUPREF_EXISTS:
                # av is (condgroup, item_yes, item_no); either arm may be
                # taken, and a missing "no" arm matches the empty string.
                # without this branch a conditional counted as zero width.
                i, j = av[1].getwidth()
                if av[2] is not None:
                    l, h = av[2].getwidth()
                    i = min(i, l)
                    j = max(j, h)
                else:
                    i = 0
                lo = lo + i
                hi = hi + j
            elif op in REPEATCODES:
                # av is (min, max, item)
                i, j = av[2].getwidth()
                lo = lo + i * av[0]
                hi = hi + j * av[1]
            elif op in UNITCODES:
                lo = lo + 1
                hi = hi + 1
            elif op == SUCCESS:
                break
        self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
        return self.width

class Tokenizer:
    # splits a pattern (str or bytes) into tokens.  a token is a single
    # character, or a backslash together with the character following it.
    # "next" always holds the look-ahead token (None at end of input) and
    # "index" the offset just past it.
    def __init__(self, string):
        self.istext = isinstance(string, str)
        self.string = string
        self.index = 0
        self.__next()
    def __next(self):
        # advance the look-ahead token
        if self.index >= len(self.string):
            self.next = None
            return
        char = self.string[self.index:self.index+1]
        # Special case for the str8, since indexing returns an integer
        # XXX This is only needed for test_bug_926075 in test_re.py
        if char and not self.istext:
            char = chr(char[0])
        if char == "\\":
            try:
                c = self.string[self.index + 1]
            except IndexError:
                raise error("bogus escape (end of line)")
            if not self.istext:
                c = chr(c)
            char = char + c
        self.index = self.index + len(char)
        self.next = char
    def match(self, char, skip=1):
        # return 1 if the look-ahead token equals char (consuming it
        # unless skip is false), else 0
        if char == self.next:
            if skip:
                self.__next()
            return 1
        return 0
    def get(self):
        # return the look-ahead token and advance
        this = self.next
        self.__next()
        return this
    def getwhile(self, n, charset):
        # consume and return up to n consecutive tokens found in charset
        result = ''
        for _ in range(n):
            c = self.next
            if c not in charset:
                break
            result += c
            self.__next()
        return result
    def tell(self):
        # snapshot of the tokenizer position, for use with seek()
        return self.index, self.next
    def seek(self, index):
        # restore a position previously returned by tell()
        self.index, self.next = index

# The following three functions are not used in this module anymore, but we keep
# them here (with DeprecationWarnings) for backwards compatibility.

def isident(char):
    # true if char is an ASCII letter or an underscore
    import warnings
    warnings.warn('sre_parse.isident() will be removed in 3.5',
                  DeprecationWarning, stacklevel=2)
    return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"

def isdigit(char):
    # true if char is an ASCII decimal digit
    import warnings
    warnings.warn('sre_parse.isdigit() will be removed in 3.5',
                  DeprecationWarning, stacklevel=2)
    return "0" <= char <= "9"

def isname(name):
    import warnings
    warnings.warn('sre_parse.isname() will be removed in 3.5',
                  DeprecationWarning, stacklevel=2)
    # check that group name is a valid string
    if not name:
        # an empty name is not valid (and name[0] would raise IndexError)
        return False
    if not isident(name[0]):
        return False
    for char in name[1:]:
        if not isident(char) and not isdigit(char):
            return False
    return True

def _class_escape(source, escape):
    # handle escape code inside character class
    # "escape" is the two-character token already read; further digits are
    # pulled from "source".  returns an (opcode, argument) tuple or raises
    # error("bogus escape: ...").
    code = ESCAPES.get(escape)
    if code:
        return code
    code = CATEGORIES.get(escape)
    # position assertions (AT codes) are not accepted inside a class
    if code and code[0] == IN:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape (exactly two digits)
            escape += source.getwhile(2, HEXDIGITS)
            if len(escape) != 4:
                raise ValueError
            return LITERAL, int(escape[2:], 16) & 0xff
        elif c == "u" and source.istext:
            # unicode escape (exactly four digits)
            escape += source.getwhile(4, HEXDIGITS)
            if len(escape) != 6:
                raise ValueError
            return LITERAL, int(escape[2:], 16)
        elif c == "U" and source.istext:
            # unicode escape (exactly eight digits)
            escape += source.getwhile(8, HEXDIGITS)
            if len(escape) != 10:
                raise ValueError
            c = int(escape[2:], 16)
            chr(c) # raise ValueError for invalid code
            return LITERAL, c
        elif c in OCTDIGITS:
            # octal escape (up to three digits)
            escape += source.getwhile(2, OCTDIGITS)
            return LITERAL, int(escape[1:], 8) & 0xff
        elif c in DIGITS:
            raise ValueError
        if len(escape) == 2:
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error("bogus escape: %s" % repr(escape))

def _escape(source, escape, state):
    # handle escape code in expression
    # like _class_escape, but also accepts position assertions and
    # numeric group references (validated against "state").
    code = CATEGORIES.get(escape)
    if code:
        return code
    code = ESCAPES.get(escape)
    if code:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape
            escape += source.getwhile(2, HEXDIGITS)
            if len(escape) != 4:
                raise ValueError
            return LITERAL, int(escape[2:], 16) & 0xff
        elif c == "u" and source.istext:
            # unicode escape (exactly four digits)
            escape += source.getwhile(4, HEXDIGITS)
            if len(escape) != 6:
                raise ValueError
            return LITERAL, int(escape[2:], 16)
        elif c == "U" and source.istext:
            # unicode escape (exactly eight digits)
            escape += source.getwhile(8, HEXDIGITS)
            if len(escape) != 10:
                raise ValueError
            c = int(escape[2:], 16)
            chr(c) # raise ValueError for invalid code
            return LITERAL, c
        elif c == "0":
            # octal escape
            escape += source.getwhile(2, OCTDIGITS)
            return LITERAL, int(escape[1:], 8) & 0xff
        elif c in DIGITS:
            # octal escape *or* decimal group reference (sigh)
            if source.next in DIGITS:
                escape = escape + source.get()
                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
                    source.next in OCTDIGITS):
                    # got three octal digits; this is an octal escape
                    escape = escape + source.get()
                    return LITERAL, int(escape[1:], 8) & 0xff
            # not an octal escape, so this is a group reference
            group = int(escape[1:])
            if group < state.groups:
                if not state.checkgroup(group):
                    raise error("cannot refer to open group")
                return GROUPREF, group
            raise ValueError
        if len(escape) == 2:
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error("bogus escape: %s" % repr(escape))

def _parse_sub(source, state, nested=1):
    # parse an alternation: a|b|c
    # returns a SubPattern.  when "nested" is true the alternation must be
    # followed by ")" (left unconsumed) or by the end of the pattern.

    items = []
    itemsappend = items.append
    sourcematch = source.match
    while 1:
        itemsappend(_parse(source, state))
        if sourcematch("|"):
            continue
        if not nested:
            break
        if not source.next or sourcematch(")", 0):
            break
        else:
            raise error("pattern not properly closed")

    if len(items) == 1:
        return items[0]

    subpattern = SubPattern(state)
    subpatternappend = subpattern.append

    # check if all items share a common prefix
    while 1:
        prefix = None
        for item in items:
            if not item:
                break
            if prefix is None:
                prefix = item[0]
            elif item[0] != prefix:
                break
        else:
            # all subitems start with a common "prefix".
            # move it out of the branch
            for item in items:
                del item[0]
            subpatternappend(prefix)
            continue # check next one
        break

    # check if the branch can be replaced by a character set
    for item in items:
        if len(item) != 1 or item[0][0] != LITERAL:
            break
    else:
        # we can store this as a character set instead of a
        # branch (the compiler may optimize this even more)
        charset = [item[0] for item in items]
        subpatternappend((IN, charset))
        return subpattern

    subpattern.append((BRANCH, (None, items)))
    return subpattern

def _parse_sub_cond(source, state, condgroup):
    # parse the body of a conditional group: yes-pattern, optionally
    # followed by "|" and a no-pattern.  returns a SubPattern holding one
    # (GROUPREF_EXISTS, (condgroup, item_yes, item_no)) entry; item_no is
    # None when there is no second branch.
    item_yes = _parse(source, state)
    if source.match("|"):
        item_no = _parse(source, state)
        if source.match("|"):
            raise error("conditional backref with more than two branches")
    else:
        item_no = None
    if source.next and not source.match(")", 0):
        raise error("pattern not properly closed")
    subpattern = SubPattern(state)
    subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
    return subpattern

_PATTERNENDERS = set("|)")
_ASSERTCHARS = set("=!<")
_LOOKBEHINDASSERTCHARS = set("=!")
_REPEATCODES = {MIN_REPEAT, MAX_REPEAT}

def _parse(source, state):
    # parse a simple pattern
    # consumes tokens from "source" until "|", ")" or the end of the
    # pattern and returns them as a SubPattern.  "state" is the shared
    # Pattern object (flags, group bookkeeping).
    subpattern = SubPattern(state)

    # precompute constants into local variables
    subpatternappend = subpattern.append
    sourceget = source.get
    sourcematch = source.match
    _len = len
    PATTERNENDERS = _PATTERNENDERS
    ASSERTCHARS = _ASSERTCHARS
    LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
    REPEATCODES = _REPEATCODES

    while 1:

        if source.next in PATTERNENDERS:
            break # end of subpattern
        this = sourceget()
        if this is None:
            break # end of pattern

        if state.flags & SRE_FLAG_VERBOSE:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                while 1:
                    this = sourceget()
                    if this in (None, "\n"):
                        break
                continue

        if this and this[0] not in SPECIAL_CHARS:
            subpatternappend((LITERAL, ord(this)))

        elif this == "[":
            # character set
            charset = []
            charsetappend = charset.append
##          if sourcematch(":"):
##              pass # handle character classes
            if sourcematch("^"):
                charsetappend((NEGATE, None))
            # check remaining characters
            # ("]" only closes the set once something was added after the
            # optional "^", so a leading "]" is taken literally)
            start = charset[:]
            while 1:
                this = sourceget()
                if this == "]" and charset != start:
                    break
                elif this and this[0] == "\\":
                    code1 = _class_escape(source, this)
                elif this:
                    code1 = LITERAL, ord(this)
                else:
                    raise error("unexpected end of regular expression")
                if sourcematch("-"):
                    # potential range
                    this = sourceget()
                    if this == "]":
                        # trailing "-" is a literal hyphen
                        if code1[0] is IN:
                            code1 = code1[1][0]
                        charsetappend(code1)
                        charsetappend((LITERAL, ord("-")))
                        break
                    elif this:
                        if this[0] == "\\":
                            code2 = _class_escape(source, this)
                        else:
                            code2 = LITERAL, ord(this)
                        if code1[0] != LITERAL or code2[0] != LITERAL:
                            raise error("bad character range")
                        lo = code1[1]
                        hi = code2[1]
                        if hi < lo:
                            raise error("bad character range")
                        charsetappend((RANGE, (lo, hi)))
                    else:
                        raise error("unexpected end of regular expression")
                else:
                    if code1[0] is IN:
                        code1 = code1[1][0]
                    charsetappend(code1)

            # XXX: <fl> should move set optimization to compiler!
            if _len(charset)==1 and charset[0][0] is LITERAL:
                subpatternappend(charset[0]) # optimization
            elif (_len(charset)==2 and charset[0][0] is NEGATE
                  and charset[1][0] is LITERAL):
                subpatternappend((NOT_LITERAL, charset[1][1])) # optimization
            else:
                # XXX: <fl> should add charmap optimization here
                subpatternappend((IN, charset))

        elif this and this[0] in REPEAT_CHARS:
            # repeat previous item
            if this == "?":
                minrep, maxrep = 0, 1
            elif this == "*":
                minrep, maxrep = 0, MAXREPEAT

            elif this == "+":
                minrep, maxrep = 1, MAXREPEAT
            elif this == "{":
                if source.next == "}":
                    subpatternappend((LITERAL, ord(this)))
                    continue
                here = source.tell()
                minrep, maxrep = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo = lo + source.get()
                if sourcematch(","):
                    while source.next in DIGITS:
                        hi = hi + sourceget()
                else:
                    hi = lo
                if not sourcematch("}"):
                    # not a well-formed {m,n}: treat "{" as a literal and
                    # rewind to just after it
                    subpatternappend((LITERAL, ord(this)))
                    source.seek(here)
                    continue
                if lo:
                    minrep = int(lo)
                    if minrep >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                if hi:
                    maxrep = int(hi)
                    if maxrep >= MAXREPEAT:
                        raise OverflowError("the repetition number is too large")
                    if maxrep < minrep:
                        raise error("bad repeat interval")
            else:
                raise error("not supported")
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                item = None
            if not item or (_len(item) == 1 and item[0][0] == AT):
                raise error("nothing to repeat")
            if item[0][0] in REPEATCODES:
                raise error("multiple repeat")
            if sourcematch("?"):
                subpattern[-1] = (MIN_REPEAT, (minrep, maxrep, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (minrep, maxrep, item))

        elif this == ".":
            subpatternappend((ANY, None))

        elif this == "(":
            # group: 1 = capturing, 2 = non-capturing, 0 = no group body
            group = 1
            name = None
            condgroup = None
            if sourcematch("?"):
                group = 0
                # options
                if sourcematch("P"):
                    # python extensions
                    if sourcematch("<"):
                        # named group: skip forward to end of name
                        name = ""
                        while 1:
                            char = sourceget()
                            if char is None:
                                raise error("unterminated name")
                            if char == ">":
                                break
                            name = name + char
                        group = 1
                        if not name:
                            raise error("missing group name")
                        if not name.isidentifier():
                            raise error("bad character in group name %r" % name)
                    elif sourcematch("="):
                        # named backreference
                        name = ""
                        while 1:
                            char = sourceget()
                            if char is None:
                                raise error("unterminated name")
                            if char == ")":
                                break
                            name = name + char
                        if not name:
                            raise error("missing group name")
                        if not name.isidentifier():
                            raise error("bad character in backref group name "
                                        "%r" % name)
                        gid = state.groupdict.get(name)
                        if gid is None:
                            raise error("unknown group name")
                        subpatternappend((GROUPREF, gid))
                        continue
                    else:
                        char = sourceget()
                        if char is None:
                            raise error("unexpected end of pattern")
                        raise error("unknown specifier: ?P%s" % char)
                elif sourcematch(":"):
                    # non-capturing group
                    group = 2
                elif sourcematch("#"):
                    # comment
                    while 1:
                        if source.next is None or source.next == ")":
                            break
                        sourceget()
                    if not sourcematch(")"):
                        raise error("unbalanced parenthesis")
                    continue
                elif source.next in ASSERTCHARS:
                    # lookahead assertions
                    char = sourceget()
                    direction = 1
                    if char == "<":
                        if source.next not in LOOKBEHINDASSERTCHARS:
                            raise error("syntax error")
                        direction = -1 # lookbehind
                        char = sourceget()
                    p = _parse_sub(source, state)
                    if not sourcematch(")"):
                        raise error("unbalanced parenthesis")
                    if char == "=":
                        subpatternappend((ASSERT, (direction, p)))
                    else:
                        subpatternappend((ASSERT_NOT, (direction, p)))
                    continue
                elif sourcematch("("):
                    # conditional backreference group
                    condname = ""
                    while 1:
                        char = sourceget()
                        if char is None:
                            raise error("unterminated name")
                        if char == ")":
                            break
                        condname = condname + char
                    group = 2
                    if not condname:
                        raise error("missing group name")
                    if condname.isidentifier():
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            raise error("unknown group name")
                    else:
                        try:
                            condgroup = int(condname)
                            if condgroup < 0:
                                # "-1" is not a group number
                                raise ValueError
                        except ValueError:
                            raise error("bad character in group name")
                        if not condgroup:
                            # group 0 is the whole match and cannot be
                            # tested; previously "(?(0)a|b)" was silently
                            # parsed as a plain alternation
                            raise error("bad group number")
                else:
                    # flags
                    if source.next not in FLAGS:
                        raise error("unexpected end of pattern")
                    while source.next in FLAGS:
                        state.flags = state.flags | FLAGS[sourceget()]
            if group:
                # parse group contents
                if group == 2:
                    # anonymous group
                    group = None
                else:
                    group = state.opengroup(name)
                if condgroup is not None:
                    p = _parse_sub_cond(source, state, condgroup)
                else:
                    p = _parse_sub(source, state)
                if not sourcematch(")"):
                    raise error("unbalanced parenthesis")
                if group is not None:
                    state.closegroup(group)
                subpatternappend((SUBPATTERN, (group, p)))
            else:
                # flags-only group: nothing but ")" may follow
                while 1:
                    char = sourceget()
                    if char is None:
                        raise error("unexpected end of pattern")
                    if char == ")":
                        break
                    raise error("unknown extension")

        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))

        elif this == "$":
            subpattern.append((AT, AT_END))

        elif this and this[0] == "\\":
            code = _escape(source, this, state)
            subpatternappend(code)

        else:
            raise error("parser error")

    return subpattern

def fix_flags(src, flags):
    # Check and fix flags according to the type of pattern (str or bytes)
    # str patterns get SRE_FLAG_UNICODE unless SRE_FLAG_ASCII is set;
    # incompatible combinations raise ValueError.  returns the new flags.
    if isinstance(src, str):
        if not flags & SRE_FLAG_ASCII:
            flags |= SRE_FLAG_UNICODE
        elif flags & SRE_FLAG_UNICODE:
            raise ValueError("ASCII and UNICODE flags are incompatible")
    else:
        if flags & SRE_FLAG_UNICODE:
            raise ValueError("can't use UNICODE flag with a bytes pattern")
    return flags

def parse(str, flags=0, pattern=None):
    # parse 're' pattern into list of (opcode, argument) tuples
    # returns the top-level SubPattern; its .pattern attribute carries the
    # final flags and group tables.  raises error on malformed input.

    source = Tokenizer(str)

    if pattern is None:
        pattern = Pattern()
    pattern.flags = flags
    pattern.str = str

    p = _parse_sub(source, pattern, 0)
    p.pattern.flags = fix_flags(str, p.pattern.flags)

    tail = source.get()
    if tail == ")":
        raise error("unbalanced parenthesis")
    elif tail:
        raise error("bogus characters at end of regular expression")

    if flags & SRE_FLAG_DEBUG:
        p.dump()

    if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
        # the VERBOSE flag was switched on inside the pattern. to be
        # on the safe side, we'll parse the whole thing again...
        return parse(str, p.pattern.flags)

    return p

def parse_template(source, pattern):
    # parse 're' replacement string into list of literals and
    # group references
    # returns (groups, literals): "literals" is a list of text chunks with
    # None placeholders, and "groups" is a list of (index into literals,
    # group number) pairs saying which placeholder takes which group.
    s = Tokenizer(source)
    sget = s.get
    groups = []
    literals = []
    literal = []
    lappend = literal.append
    def addgroup(index):
        # flush pending literal text, then reserve a slot for the group
        if literal:
            literals.append(''.join(literal))
            del literal[:]
        groups.append((len(literals), index))
        literals.append(None)
    while True:
        this = sget()
        if this is None:
            break # end of replacement string
        if this[0] == "\\":
            # group
            c = this[1]
            if c == "g":
                name = ""
                if s.match("<"):
                    while True:
                        char = sget()
                        if char is None:
                            raise error("unterminated group name")
                        if char == ">":
                            break
                        name += char
                if not name:
                    raise error("missing group name")
                try:
                    index = int(name)
                    if index < 0:
                        raise error("negative group number")
                except ValueError:
                    if not name.isidentifier():
                        raise error("bad character in group name")
                    try:
                        index = pattern.groupindex[name]
                    except KeyError:
                        raise IndexError("unknown group name")
                addgroup(index)
            elif c == "0":
                # octal escape (up to three digits including the "0")
                if s.next in OCTDIGITS:
                    this += sget()
                    if s.next in OCTDIGITS:
                        this += sget()
                lappend(chr(int(this[1:], 8) & 0xff))
            elif c in DIGITS:
                # three octal digits form a character; otherwise this is
                # a one- or two-digit group reference
                isoctal = False
                if s.next in DIGITS:
                    this += sget()
                    if (c in OCTDIGITS and this[2] in OCTDIGITS and
                        s.next in OCTDIGITS):
                        this += sget()
                        isoctal = True
                        lappend(chr(int(this[1:], 8) & 0xff))
                if not isoctal:
                    addgroup(int(this[1:]))
            else:
                # known escapes become their character; unknown ones are
                # kept verbatim (backslash included)
                try:
                    this = chr(ESCAPES[this][1])
                except KeyError:
                    pass
                lappend(this)
        else:
            lappend(this)
    if literal:
        literals.append(''.join(literal))
    if not isinstance(source, str):
        # The tokenizer implicitly decodes bytes objects as latin-1, we must
        # therefore re-encode the final representation.
        literals = [None if chunk is None else chunk.encode('latin-1')
                    for chunk in literals]
    return groups, literals

def expand_template(template, match):
    # fill a (groups, literals) template from parse_template with the
    # groups of "match" and return the joined result.  raises error if a
    # referenced group did not participate in the match or does not exist.
    g = match.group
    sep = match.string[:0] # empty str or bytes, matching the subject type
    groups, literals = template
    literals = literals[:]
    try:
        for index, group in groups:
            literals[index] = s = g(group)
            if s is None:
                raise error("unmatched group")
    except IndexError:
        raise error("invalid group reference")
    return sep.join(literals)