1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945
|
import sys
from pypy.interpreter.baseobjspace import W_Root
from pypy.interpreter.typedef import GetSetProperty, TypeDef
from pypy.interpreter.typedef import interp_attrproperty, interp_attrproperty_w
from pypy.interpreter.typedef import make_weakref_descr
from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault
from pypy.interpreter.error import OperationError, oefmt
from pypy.objspace.std.util import generic_alias_class_getitem
from rpython.rlib.objectmodel import compute_hash
from rpython.rlib.rarithmetic import intmask, int_between
from rpython.rlib import jit, rutf8
from rpython.rlib.rstring import StringBuilder
# ____________________________________________________________
#
# Constants and exposed functions
from rpython.rlib.rsre import rsre_core, rsre_char, rsre_utf8, rsre_constants as consts
from rpython.rlib.rsre.rsre_char import CODESIZE, MAXREPEAT, MAXGROUPS, set_unicode_db
from rpython.rlib.rsre.rpy.sre_constants import OPCODES
ORDERED_OPCODE_NAMES = [name for name, value in sorted(OPCODES.items(), key=lambda element: element[1])]
@unwrap_spec(character=int)
def w_ascii_iscased(space, character):
return space.newbool(rsre_char.iscased_ascii(character))
@unwrap_spec(character=int)
def w_unicode_iscased(space, character):
return space.newbool(rsre_char.iscased_unicode(character))
@unwrap_spec(character=int)
def w_ascii_tolower(space, character):
return space.newint(rsre_char.getlower_ascii(character))
@unwrap_spec(character=int)
def w_unicode_tolower(space, character):
return space.newint(rsre_char.getlower_unicode(character))
def w_getcodesize(space):
return space.newint(CODESIZE)
# use the same version of unicodedb as the standard objspace
import pypy.objspace.std.unicodeobject
set_unicode_db(pypy.objspace.std.unicodeobject.unicodedb)
# ____________________________________________________________
#
class UnicodeAsciiMatchContext(rsre_core.StrMatchContext):
# we make a subclass just to mark that it originates from a W_UnicodeObject
pass
def slice_w(space, ctx, start, end, w_default):
# 'start' and 'end' are byte positions
if ctx.ZERO <= start <= end:
if isinstance(ctx, rsre_core.BufMatchContext):
length = ctx._buffer.getlength()
start = min(start, length)
end = min(end, length)
return space.newbytes(ctx._buffer.getslice(start, 1,
end-start))
elif isinstance(ctx, UnicodeAsciiMatchContext):
s = ctx._string[start:end]
return space.newutf8(s, len(s))
elif isinstance(ctx, rsre_core.StrMatchContext):
start = ctx._real_pos(start)
end = ctx._real_pos(end)
return space.newbytes(ctx._string[start:end])
elif isinstance(ctx, rsre_utf8.Utf8MatchContext):
s = ctx._utf8[start:end]
lgt = rutf8.codepoints_in_utf8(s)
return space.newutf8(s, lgt)
else:
# unreachable
raise SystemError
return w_default
@jit.look_inside_iff(lambda ctx, num_groups: jit.isconstant(num_groups))
def do_flatten_marks(ctx, num_groups):
# Returns a list of RPython-level integers.
# Unlike the app-level groups() method, groups are numbered from 0
# and the returned list does not start with the whole match range.
# The integers are byte positions, not character indexes (for utf8).
if num_groups == 0:
return None
result = [-1] * (2 * num_groups)
mark = ctx.match_marks
while mark is not None:
index = jit.promote(mark.gid)
if result[index] == -1:
result[index] = mark.position
mark = mark.prev
return result
@jit.look_inside_iff(lambda space, ctx, fmarks, num_groups, w_default: jit.isconstant(num_groups))
def allgroups_w(space, ctx, fmarks, num_groups, w_default):
grps = [slice_w(space, ctx, fmarks[i * 2], fmarks[i * 2 + 1], w_default)
for i in range(num_groups)]
return space.newtuple(grps)
def import_re(space):
return space.fromcache(AppReCache).get_re()
def matchcontext(space, ctx, pattern):
try:
return rsre_core.match_context(ctx, pattern)
except rsre_core.Error as e:
raise OperationError(space.w_RuntimeError, space.newtext(e.msg))
def searchcontext(space, ctx, pattern):
try:
return rsre_core.search_context(ctx, pattern)
except rsre_core.Error as e:
raise OperationError(space.w_RuntimeError, space.newtext(e.msg))
# ____________________________________________________________
#
# SRE_Pattern class
FLAG_NAMES = ["re.TEMPLATE", "re.IGNORECASE", "re.LOCALE", "re.MULTILINE",
"re.DOTALL", "re.UNICODE", "re.VERBOSE", "re.DEBUG",
"re.ASCII"]
class AppReCache(object):
def __init__(self, space):
self.space = space
self.w_re = None
def get_re(self):
if self.w_re is not None:
return self.w_re
space = self.space
w_import = space.getattr(space.builtin, space.newtext("__import__"))
w_re = space.call_function(w_import, space.newtext("re"))
self.w_re = w_re
return w_re
class W_SRE_Pattern(W_Root):
_immutable_fields_ = ["code", "flags", "num_groups", "w_groupindex"]
def copy_identity_w(self, args_w):
return self
def repr_w(self):
space = self.space
w_s = space.repr(self.w_pattern)
w_s = space.getslice(w_s, space.newint(0), space.newint(200))
u = space.utf8_w(w_s)
flag_items = []
flags = self.flags
if self.is_known_unicode():
if ((flags & (consts.SRE_FLAG_LOCALE |
consts.SRE_FLAG_UNICODE |
256)) # consts.SRE_FLAG_ASCII
== consts.SRE_FLAG_UNICODE):
flags &= ~consts.SRE_FLAG_UNICODE
for i, name in enumerate(FLAG_NAMES):
if flags & (1 << i):
flags -= (1 << i)
flag_items.append(name)
if flags != 0:
flag_items.append('0x%x' % flags)
if len(flag_items) == 0:
usep = ''
uflags = ''
else:
usep = ', '
uflags = '|'.join(flag_items)
return space.newtext('re.compile(%s%s%s)' % (u, usep, uflags))
def descr_eq(self, space, w_other):
if not isinstance(w_other, W_SRE_Pattern):
return space.w_NotImplemented
other = w_other
# Compare the code and the pattern because the same pattern can
# produce different codes depending on the locale used to compile the
# pattern when the re.LOCALE flag is used. Don't compare groups,
# indexgroup nor groupindex: they are derivated from the pattern.
return space.newbool(
self.flags == other.flags and
self.code.pattern == other.code.pattern and
space.eq_w(self.w_pattern, other.w_pattern))
def descr_hash(self, space):
from rpython.rlib.rarithmetic import intmask
x = 0x345678
for c in self.code.pattern:
x = intmask((1000003 * x) ^ c)
x = intmask((1000003 * x) ^ self.flags)
x = intmask((1000003 * x) ^ space.hash_w(self.w_pattern))
return space.newint(x)
def fget_groupindex(self, space):
w_groupindex = self.w_groupindex
if space.isinstance_w(w_groupindex, space.w_dict):
w_groupindex = space.newdictproxy(w_groupindex)
return w_groupindex
def is_known_bytes(self):
space = self.space
if space.is_none(self.w_pattern):
return False
return not space.isinstance_w(self.w_pattern, space.w_unicode)
def is_known_unicode(self):
space = self.space
if space.is_none(self.w_pattern):
return False
return space.isinstance_w(self.w_pattern, space.w_unicode)
def make_ctx(self, w_string, pos=0, endpos=sys.maxint):
"""Make a StrMatchContext, BufMatchContext or a Utf8MatchContext for
searching in the given w_string object."""
space = self.space
if pos < 0:
pos = 0
if endpos < pos:
endpos = pos
if space.isinstance_w(w_string, space.w_unicode):
if self.is_known_bytes():
raise oefmt(space.w_TypeError,
"can't use a bytes pattern on a string-like "
"object")
w_unicode_obj = space.convert_arg_to_w_unicode(w_string)
utf8str = w_unicode_obj._utf8
length = w_unicode_obj._len()
if pos <= 0:
bytepos = 0
elif pos >= length:
bytepos = len(utf8str)
else:
bytepos = w_unicode_obj._index_to_byte(pos)
if endpos >= length:
endbytepos = len(utf8str)
else:
endbytepos = w_unicode_obj._index_to_byte(endpos)
if w_unicode_obj.is_ascii():
ctx = UnicodeAsciiMatchContext(
utf8str, bytepos, endbytepos)
else:
ctx = rsre_utf8.Utf8MatchContext(
utf8str, bytepos, endbytepos)
# we store the w_string on the ctx too, for
# W_SRE_Match.bytepos_to_charindex()
ctx.w_unicode_obj = w_unicode_obj
return ctx
elif self.is_known_unicode():
raise oefmt(space.w_TypeError,
"can't use a string pattern on a bytes-like "
"object")
elif space.isinstance_w(w_string, space.w_bytes):
string = space.bytes_w(w_string)
length = len(string)
if pos > length:
pos = length
if endpos > length:
endpos = length
return rsre_core.StrMatchContext(string, pos, endpos)
else:
buf = space.readbuf_w(w_string)
size = buf.getlength()
assert size >= 0
if pos > size:
pos = size
if endpos > size:
endpos = size
return rsre_core.BufMatchContext(buf, pos, endpos)
def fresh_copy(self, ctx):
if isinstance(ctx, rsre_utf8.Utf8MatchContext):
result = rsre_utf8.Utf8MatchContext(
ctx._utf8, ctx.match_start, ctx.end)
result.w_unicode_obj = ctx.w_unicode_obj
elif isinstance(ctx, UnicodeAsciiMatchContext):
result = UnicodeAsciiMatchContext(
ctx._string, ctx.match_start, ctx.end)
elif isinstance(ctx, rsre_core.StrMatchContext):
result = self._make_str_match_context(
ctx._string, ctx.match_start, ctx.end)
elif isinstance(ctx, rsre_core.BufMatchContext):
result = rsre_core.BufMatchContext(
ctx._buffer, ctx.match_start, ctx.end)
else:
raise AssertionError("bad ctx type")
result.match_end = ctx.match_end
return result
def _make_str_match_context(self, str, pos, endpos):
# for tests to override
return rsre_core.StrMatchContext(str,
pos, endpos)
def getmatch(self, ctx, found, w_string):
if found:
return W_SRE_Match(self, ctx, w_string)
else:
return self.space.w_None
@unwrap_spec(pos=int, endpos=int)
def match_w(self, w_string, pos=0, endpos=sys.maxint):
"""Matches zero or more characters at the beginning of the string."""
ctx = self.make_ctx(w_string, pos, endpos)
return self.getmatch(ctx, matchcontext(self.space, ctx, self.code), w_string)
@unwrap_spec(pos=int, endpos=int)
def fullmatch_w(self, w_string, pos=0, endpos=sys.maxint):
"""Matches against all of the string."""
ctx = self.make_ctx(w_string, pos, endpos)
ctx.match_mode = rsre_core.MODE_FULL
return self.getmatch(ctx, matchcontext(self.space, ctx, self.code), w_string)
@unwrap_spec(pos=int, endpos=int)
def search_w(self, w_string, pos=0, endpos=sys.maxint):
"""Scan through string looking for a match, and return a corresponding match object instance.
Return None if no position in the string matches."""
ctx = self.make_ctx(w_string, pos, endpos)
return self.getmatch(ctx, searchcontext(self.space, ctx, self.code), w_string)
@unwrap_spec(pos=int, endpos=int)
def findall_w(self, w_string, pos=0, endpos=sys.maxint):
"""Return a list of all non-overlapping matches of pattern in string."""
space = self.space
matchlist_w = []
ctx = self.make_ctx(w_string, pos, endpos)
while True:
if not searchcontext(space, ctx, self.code):
break
num_groups = self.num_groups
w_emptystr = space.newtext("")
if num_groups == 0:
w_item = slice_w(space, ctx, ctx.match_start, ctx.match_end,
w_emptystr)
else:
fmarks = do_flatten_marks(ctx, num_groups)
if num_groups == 1:
w_item = slice_w(space, ctx, fmarks[0], fmarks[1],
w_emptystr)
else:
w_item = allgroups_w(space, ctx, fmarks, num_groups,
w_emptystr)
matchlist_w.append(w_item)
ctx.reset(ctx.match_end, ctx.match_start == ctx.match_end)
return space.newlist(matchlist_w)
@unwrap_spec(pos=int, endpos=int)
def finditer_w(self, w_string, pos=0, endpos=sys.maxint):
"""Return an iterator over all non-overlapping matches for the RE pattern in string.
For each match, the iterator returns a match object."""
# this also works as the implementation of the undocumented
# scanner() method.
ctx = self.make_ctx(w_string, pos, endpos)
scanner = W_SRE_Scanner(self, ctx, self.code, w_string)
return scanner
@unwrap_spec(maxsplit=int)
def split_w(self, w_string, maxsplit=0):
"""Split string by the occurrences of pattern."""
space = self.space
#
splitlist = []
n = 0
ctx = self.make_ctx(w_string)
last = ctx.ZERO
while not maxsplit or n < maxsplit:
pattern = self.code
num_groups = self.num_groups
split_jitdriver.jit_merge_point(
pattern=pattern, num_groups=num_groups, ctx_type=type(ctx))
if not searchcontext(space, ctx, pattern):
break
splitlist.append(slice_w(space, ctx, last, ctx.match_start,
space.w_None))
# add groups (if any)
fmarks = do_flatten_marks(ctx, num_groups)
for groupnum in range(num_groups):
groupstart, groupend = fmarks[groupnum*2], fmarks[groupnum*2+1]
splitlist.append(slice_w(space, ctx, groupstart, groupend,
space.w_None))
n += 1
last = ctx.match_end
ctx.reset(last, ctx.match_start == last)
splitlist.append(slice_w(space, ctx, last, ctx.end, space.w_None))
return space.newlist(splitlist)
@unwrap_spec(count=int)
def sub_w(self, w_repl, w_string, count=0):
"""Return the string obtained by replacing the leftmost non-overlapping occurrences of pattern in string by the replacement repl."""
w_item, n = self.subx(w_repl, w_string, count)
return w_item
@unwrap_spec(count=int)
def subn_w(self, w_repl, w_string, count=0):
"""Return the tuple (new_string, number_of_subs_made) found by replacing the leftmost non-overlapping occurrences of pattern with the replacement repl."""
w_item, n = self.subx(w_repl, w_string, count)
space = self.space
return space.newtuple2(w_item, space.newint(n))
def subx(self, w_ptemplate, w_string, count):
space = self.space
# use a (much faster) string builder (possibly utf8) if w_ptemplate and
# w_string are both string or both unicode objects, and if w_ptemplate
# is a literal
use_builder = '\x00' # or 'S'tring or 'U'nicode/UTF8
is_buffer = False
filter_as_string = None
if space.isinstance_w(w_string, space.w_unicode):
if not self.is_known_unicode():
raise oefmt(space.w_TypeError,
"cannot use a bytes pattern on a string-like object")
else:
if self.is_known_unicode():
raise oefmt(space.w_TypeError,
"cannot use a string pattern on a bytes-like object")
if space.is_true(space.callable(w_ptemplate)):
w_filter = w_ptemplate
filter_is_callable = True
else:
if space.isinstance_w(w_ptemplate, space.w_unicode):
filter_as_string = space.utf8_w(w_ptemplate)
literal = '\\' not in filter_as_string
if space.isinstance_w(w_string, space.w_unicode) and literal:
use_builder = 'U'
elif space.isinstance_w(w_ptemplate, space.w_bytes):
filter_as_string = space.bytes_w(w_ptemplate)
literal = '\\' not in filter_as_string
if space.isinstance_w(w_string, space.w_bytes) and literal:
use_builder = 'S'
else:
if space.isinstance_w(w_ptemplate, space.w_bytes):
filter_as_string = space.bytes_w(w_ptemplate)
else:
filter_as_string = space.readbuf_w(w_ptemplate).as_str()
is_buffer = True
literal = '\\' not in filter_as_string
if space.isinstance_w(w_string, space.w_bytes) and literal:
use_builder = 'S'
if literal:
w_filter = w_ptemplate
filter_is_callable = False
else:
# not a literal; hand it over to the template compiler
# FIX for a CPython 3.5 bug: if w_ptemplate is a buffer
# (e.g. a bytearray), convert it to a byte string here.
if is_buffer:
w_ptemplate = space.newbytes(filter_as_string)
w_re = import_re(space)
w_filter = space.call_method(w_re, '_subx',
self, w_ptemplate)
filter_is_callable = space.is_true(space.callable(w_filter))
#
ctx = self.make_ctx(w_string)
ctx_end = ctx.end
pattern = self.code
# shortcut if the pattern doesn't occur at all (relatively common, eg
# when escaping things)
if not searchcontext(space, ctx, pattern):
w_typ = space.type(w_string)
if w_typ is not space.w_unicode and w_typ is not space.w_bytes:
w_string = slice_w(space, ctx, ctx.ZERO, ctx_end, space.w_None)
return w_string, 0
# XXX this is a bit of a mess, but it improves performance a lot
sublist_w = strbuilder = None
if use_builder != '\x00':
assert filter_as_string is not None
strbuilder = StringBuilder(ctx_end)
else:
sublist_w = []
n = 0
last_pos = ctx.ZERO
while not count or n < count:
# on entering this loop for the first time, we have already
# performed one match above
space = self.space
if last_pos < ctx.match_start:
_sub_append_slice(
ctx, space, use_builder, sublist_w,
strbuilder, last_pos, ctx.match_start)
if 1: # keeps the following block indented
last_pos = ctx.match_end
if filter_is_callable:
w_match = self.getmatch(ctx, True, w_string)
# make a copy of 'ctx'; see test_sub_matches_stay_valid
ctx = self.fresh_copy(ctx)
w_piece = space.call_function(w_filter, w_match)
if not space.is_w(w_piece, space.w_None):
assert strbuilder is None
assert use_builder == '\x00'
sublist_w.append(w_piece)
else:
if use_builder != '\x00':
assert filter_as_string is not None
assert strbuilder is not None
strbuilder.append(filter_as_string)
else:
sublist_w.append(w_filter)
n += 1
start = ctx.match_end
ctx.reset(start, ctx.match_start == start)
sub_jitdriver.jit_merge_point(
use_builder=use_builder,
filter_is_callable=filter_is_callable,
filter_type=type(w_filter),
pattern=pattern,
)
if not searchcontext(space, ctx, pattern):
break
if last_pos < ctx.end:
_sub_append_slice(ctx, space, use_builder, sublist_w,
strbuilder, last_pos, ctx.end)
if use_builder != '\x00':
assert strbuilder is not None
result_bytes = strbuilder.build()
if use_builder == 'S':
assert not isinstance(ctx, rsre_utf8.Utf8MatchContext)
return space.newbytes(result_bytes), n
elif use_builder == 'U':
assert (isinstance(ctx, UnicodeAsciiMatchContext) or
isinstance(ctx, rsre_utf8.Utf8MatchContext))
return space.newutf8(result_bytes,
rutf8.codepoints_in_utf8(result_bytes)), n
else:
raise AssertionError(use_builder)
else:
if space.isinstance_w(w_string, space.w_unicode):
w_emptystr = space.newutf8('', 0)
else:
w_emptystr = space.newbytes('')
w_item = space.call_method(w_emptystr, 'join',
space.newlist(sublist_w))
return w_item, n
def sub_get_printable_location(filter_is_callable, use_builder, filter_type, pattern):
s = str(pattern.pattern)
if len(s) > 120:
s = s[:110] + '...'
if use_builder == '\x00':
use_builder = 'list'
else:
use_builder = "%sBuilder" % use_builder
return "re.sub %s %s %s %s" % (s, filter_is_callable, use_builder, filter_type)
sub_jitdriver = jit.JitDriver(
reds="auto",
greens=["filter_is_callable", "use_builder", "filter_type", "pattern"],
get_printable_location=sub_get_printable_location,
)
def split_get_printable_location(num_groups, ctx_type, pattern):
s = str(pattern.pattern)
if len(s) > 120:
s = s[:110] + '...'
return "re.split %s %s %s" % (s, num_groups, ctx_type)
split_jitdriver = jit.JitDriver(
reds="auto",
greens=["num_groups", "ctx_type", "pattern"],
get_printable_location=split_get_printable_location,
)
def _sub_append_slice(ctx, space, use_builder, sublist_w,
strbuilder, start, end):
if use_builder != '\x00':
assert strbuilder is not None
if isinstance(ctx, rsre_core.BufMatchContext):
assert use_builder == 'S'
return strbuilder.append(ctx._buffer.getslice(start, 1, end-start))
if isinstance(ctx, UnicodeAsciiMatchContext):
assert use_builder == 'U'
return strbuilder.append_slice(ctx._string, start, end)
if isinstance(ctx, rsre_core.StrMatchContext):
assert use_builder == 'S'
start = ctx._real_pos(start)
end = ctx._real_pos(end)
return strbuilder.append_slice(ctx._string, start, end)
if isinstance(ctx, rsre_utf8.Utf8MatchContext):
assert use_builder == 'U'
return strbuilder.append_slice(ctx._utf8, start, end)
assert 0, "unreachable"
else:
sublist_w.append(slice_w(space, ctx, start, end, space.w_None))
@unwrap_spec(flags=int, groups=int, w_groupindex=WrappedDefault(None),
w_indexgroup=WrappedDefault(None))
def SRE_Pattern__new__(space, w_subtype, w_pattern, flags, w_code,
groups=0, w_groupindex=None, w_indexgroup=None):
n = space.len_w(w_code)
code = [intmask(space.uint_w(space.getitem(w_code, space.newint(i))))
for i in range(n)]
#
w_srepat = space.allocate_instance(W_SRE_Pattern, w_subtype)
srepat = space.interp_w(W_SRE_Pattern, w_srepat)
srepat.space = space
# Type check
if not (space.is_none(w_pattern) or
space.isinstance_w(w_pattern, space.w_unicode)):
space.readbuf_w(w_pattern)
srepat.w_pattern = w_pattern # the original uncompiled pattern
srepat.flags = flags
# note: we assume that the app-level is caching SRE_Pattern objects,
# so that we don't need to do it here. Creating new SRE_Pattern
# objects all the time would be bad for the JIT, which relies on the
# identity of the CompiledPattern() object.
srepat.code = rsre_core.CompiledPattern(code, flags)
srepat.num_groups = groups
srepat.w_groupindex = w_groupindex
srepat.w_indexgroup = w_indexgroup
return w_srepat
W_SRE_Pattern.typedef = TypeDef(
're.Pattern',
__doc__ = "Compiled regular expression object.",
__new__ = interp2app(SRE_Pattern__new__),
__copy__ = interp2app(W_SRE_Pattern.copy_identity_w),
__deepcopy__ = interp2app(W_SRE_Pattern.copy_identity_w),
__repr__ = interp2app(W_SRE_Pattern.repr_w),
__weakref__ = make_weakref_descr(W_SRE_Pattern),
__eq__ = interp2app(W_SRE_Pattern.descr_eq),
__hash__ = interp2app(W_SRE_Pattern.descr_hash),
findall = interp2app(W_SRE_Pattern.findall_w),
finditer = interp2app(W_SRE_Pattern.finditer_w),
match = interp2app(W_SRE_Pattern.match_w),
fullmatch = interp2app(W_SRE_Pattern.fullmatch_w),
scanner = interp2app(W_SRE_Pattern.finditer_w), # reuse finditer()
search = interp2app(W_SRE_Pattern.search_w),
split = interp2app(W_SRE_Pattern.split_w),
sub = interp2app(W_SRE_Pattern.sub_w),
subn = interp2app(W_SRE_Pattern.subn_w),
flags = interp_attrproperty('flags', W_SRE_Pattern,
wrapfn="newint"),
groupindex = GetSetProperty(W_SRE_Pattern.fget_groupindex),
groups = interp_attrproperty('num_groups', W_SRE_Pattern,
wrapfn="newint"),
pattern = interp_attrproperty_w('w_pattern', W_SRE_Pattern),
__class_getitem__ = interp2app(
generic_alias_class_getitem, as_classmethod=True),
)
W_SRE_Pattern.typedef.acceptable_as_base_class = False
# ____________________________________________________________
#
# SRE_Match class
class W_SRE_Match(W_Root):
flatten_cache = None
def __init__(self, srepat, ctx, w_string):
self.space = srepat.space
self.srepat = srepat
self.ctx = ctx
self.w_string = w_string
def repr_w(self):
space = self.space
ctx = self.ctx
start, end = ctx.match_start, ctx.match_end
w_s = slice_w(space, ctx, start, end, space.w_None)
# follow the same logic as CPython, repr'ing the whole match
# before deciding to throw away everything beyond the 50th unichar
# (which will leave the opening quote unbalanced, and might be
# cutting in the middle of a '\' escape)
w_s = space.repr(w_s)
w_s = space.getslice(w_s, space.newint(0), space.newint(50))
u = space.utf8_w(w_s)
start = self.bytepos_to_charindex(start)
end = self.bytepos_to_charindex(end)
return space.newtext('<re.Match object; span=(%d, %d), match=%s>' %
(start, end, u))
def copy_identity_w(self, args_w):
return self
def descr_getitem(self, space, w_index):
start, end = self.do_span(w_index)
return slice_w(space, self.ctx, start, end, space.w_None)
@jit.look_inside_iff(lambda self, args_w: jit.isconstant(len(args_w)))
def group_w(self, args_w):
"""group([group1, ...]) -> str or tuple.
Return subgroup(s) of the match by indices or names.
For 0 returns the entire match."""
space = self.space
ctx = self.ctx
if len(args_w) <= 1:
if len(args_w) == 0:
start, end = ctx.match_start, ctx.match_end
else:
start, end = self.do_span(args_w[0])
return slice_w(space, ctx, start, end, space.w_None)
else:
results = [None] * len(args_w)
for i in range(len(args_w)):
start, end = self.do_span(args_w[i])
results[i] = slice_w(space, ctx, start, end, space.w_None)
return space.newtuple(results)
@unwrap_spec(w_default=WrappedDefault(None))
def groups_w(self, w_default=None):
fmarks = self.flatten_marks()
num_groups = self.srepat.num_groups
return allgroups_w(self.space, self.ctx, fmarks, num_groups, w_default)
@unwrap_spec(w_default=WrappedDefault(None))
def groupdict_w(self, w_default=None):
space = self.space
w_dict = space.newdict()
w_groupindex = self.srepat.w_groupindex
w_iterator = space.iter(w_groupindex)
while True:
try:
w_key = space.next(w_iterator)
except OperationError as e:
if not e.match(space, space.w_StopIteration):
raise
break # done
w_value = space.getitem(w_groupindex, w_key)
start, end = self.do_span(w_value)
w_grp = slice_w(space, self.ctx, start, end, w_default)
space.setitem(w_dict, w_key, w_grp)
return w_dict
def expand_w(self, w_template):
space = self.space
w_re = import_re(space)
return space.call_method(w_re, '_expand', self.srepat,
self, w_template)
@unwrap_spec(w_groupnum=WrappedDefault(0))
def start_w(self, w_groupnum):
start, end = self.do_span(w_groupnum)
start = self.bytepos_to_charindex(start)
return self.space.newint(start)
@unwrap_spec(w_groupnum=WrappedDefault(0))
def end_w(self, w_groupnum):
start, end = self.do_span(w_groupnum)
end = self.bytepos_to_charindex(end)
return self.space.newint(end)
@unwrap_spec(w_groupnum=WrappedDefault(0))
def span_w(self, w_groupnum):
start, end = self.do_span(w_groupnum)
return self.new_charindex_tuple(start, end)
def new_charindex_tuple(self, start, end):
start = self.bytepos_to_charindex(start)
end = self.bytepos_to_charindex(end)
return self.space.newtuple2(self.space.newint(start),
self.space.newint(end))
def bytepos_to_charindex(self, bytepos):
# Transform a 'byte position', as returned by all methods from
# rsre_core, back into a 'character index'. This is for UTF8
# handling.
ctx = self.ctx
if isinstance(ctx, rsre_utf8.Utf8MatchContext):
return ctx.w_unicode_obj._byte_to_index(bytepos)
else:
return bytepos
def flatten_marks(self):
if self.flatten_cache is None:
num_groups = self.srepat.num_groups
self.flatten_cache = do_flatten_marks(self.ctx, num_groups)
return self.flatten_cache
def do_span(self, w_arg):
# return a pair of integers, which are byte positions, not
# character indexes (for utf8)
space = self.space
try:
groupnum = space.getindex_w(w_arg, space.w_OverflowError)
except OperationError as e:
if not e.match(space, space.w_TypeError) and \
not e.match(space, space.w_OverflowError):
raise
try:
w_groupnum = space.getitem(self.srepat.w_groupindex, w_arg)
except OperationError as e:
if not e.match(space, space.w_KeyError):
raise
raise oefmt(space.w_IndexError, "no such group")
groupnum = space.int_w(w_groupnum)
if groupnum == 0:
return self.ctx.match_start, self.ctx.match_end
elif 1 <= groupnum <= self.srepat.num_groups:
fmarks = self.flatten_marks()
idx = 2*(groupnum-1)
assert idx >= 0
return fmarks[idx], fmarks[idx+1]
else:
raise oefmt(space.w_IndexError, "no such group")
def _last_index(self):
mark = self.ctx.match_marks
if mark is not None:
return mark.gid // 2 + 1
return -1
def fget_lastgroup(self, space):
lastindex = self._last_index()
if lastindex < 0:
return space.w_None
w_result = space.finditem(self.srepat.w_indexgroup,
space.newint(lastindex))
if w_result is None:
return space.w_None
return w_result
def fget_lastindex(self, space):
lastindex = self._last_index()
if lastindex >= 0:
return space.newint(lastindex)
return space.w_None
def fget_pos(self, space):
return space.newint(self.bytepos_to_charindex(self.ctx.original_pos))
def fget_endpos(self, space):
return space.newint(self.bytepos_to_charindex(self.ctx.end))
def fget_regs(self, space):
space = self.space
fmarks = self.flatten_marks()
num_groups = self.srepat.num_groups
result_w = [None] * (num_groups + 1)
ctx = self.ctx
result_w[0] = self.new_charindex_tuple(ctx.match_start,
ctx.match_end)
for i in range(num_groups):
result_w[i + 1] = self.new_charindex_tuple(fmarks[i*2],
fmarks[i*2+1])
return space.newtuple(result_w)
def fget_string(self, space):
return self.w_string
W_SRE_Match.typedef = TypeDef(
're.Match',
__doc__ = """The result of re.match() and re.search().
Match objects always have a boolean value of True.""",
__copy__ = interp2app(W_SRE_Match.copy_identity_w),
__deepcopy__ = interp2app(W_SRE_Match.copy_identity_w),
__repr__ = interp2app(W_SRE_Match.repr_w),
__getitem__ = interp2app(W_SRE_Match.descr_getitem),
#
group = interp2app(W_SRE_Match.group_w),
groups = interp2app(W_SRE_Match.groups_w),
groupdict = interp2app(W_SRE_Match.groupdict_w),
start = interp2app(W_SRE_Match.start_w),
end = interp2app(W_SRE_Match.end_w),
span = interp2app(W_SRE_Match.span_w),
expand = interp2app(W_SRE_Match.expand_w),
#
re = interp_attrproperty_w('srepat', W_SRE_Match),
string = GetSetProperty(W_SRE_Match.fget_string),
pos = GetSetProperty(W_SRE_Match.fget_pos),
endpos = GetSetProperty(W_SRE_Match.fget_endpos),
lastgroup = GetSetProperty(W_SRE_Match.fget_lastgroup),
lastindex = GetSetProperty(W_SRE_Match.fget_lastindex),
regs = GetSetProperty(W_SRE_Match.fget_regs),
__class_getitem__ = interp2app(
generic_alias_class_getitem, as_classmethod=True),
)
W_SRE_Match.typedef.acceptable_as_base_class = False
# ____________________________________________________________
#
# SRE_Scanner class
# This is mostly an internal class in CPython.
# Our version is also directly iterable, to make finditer() easier.
class W_SRE_Scanner(W_Root):
def __init__(self, pattern, ctx, code, w_string):
self.space = pattern.space
self.srepat = pattern
self.ctx = ctx
self.code = code
self.w_string = w_string
# 'self.ctx' is always a fresh context in which no searching
# or matching succeeded so far. It is None when the iterator is
# exhausted.
def iter_w(self):
return self
def next_w(self):
if self.ctx is None:
raise OperationError(self.space.w_StopIteration, self.space.w_None)
if not searchcontext(self.space, self.ctx, self.code):
raise OperationError(self.space.w_StopIteration, self.space.w_None)
return self.getmatch(True)
def match_w(self):
if self.ctx is None:
return self.space.w_None
return self.getmatch(matchcontext(self.space, self.ctx, self.code))
def search_w(self):
if self.ctx is None:
return self.space.w_None
return self.getmatch(searchcontext(self.space, self.ctx, self.code))
def getmatch(self, found):
ctx = self.ctx
assert ctx is not None
if found:
thisstart = ctx.match_start
nextstart = ctx.match_end
self.ctx = self.srepat.fresh_copy(ctx)
self.ctx.reset(nextstart, thisstart == nextstart)
match = W_SRE_Match(self.srepat, ctx, self.w_string)
return match
else:
self.ctx = None
return None
W_SRE_Scanner.typedef = TypeDef(
'_sre.SRE_Scanner',
__iter__ = interp2app(W_SRE_Scanner.iter_w),
__next__ = interp2app(W_SRE_Scanner.next_w),
match = interp2app(W_SRE_Scanner.match_w),
search = interp2app(W_SRE_Scanner.search_w),
pattern = interp_attrproperty_w('srepat', W_SRE_Scanner),
)
W_SRE_Scanner.typedef.acceptable_as_base_class = False
|