1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
|
--- code/trunk/pcre_exec.c 2013/07/02 18:37:36 1346
+++ code/trunk/pcre_exec.c 2013/07/26 10:03:38 1350
@@ -5637,7 +5637,7 @@
}
}
- /* Match extended Unicode sequences. We will get here only if the
+ /* Match extended Unicode grapheme clusters. We will get here only if the
support is in the binary; otherwise a compile-time error occurs. */
else if (ctype == OP_EXTUNI)
@@ -5670,21 +5670,41 @@
/* eptr is now past the end of the maximum run */
if (possessive) continue; /* No backtracking */
+
for(;;)
{
- if (eptr == pp) goto TAIL_RECURSE;
+ int lgb, rgb;
+ PCRE_PUCHAR fptr;
+
+ if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
+
+ /* Backtracking over an extended grapheme cluster involves inspecting
+ the previous two characters (if present) to see if a break is
+ permitted between them. */
+
eptr--;
- for (;;) /* Move back over one extended */
+ if (!utf) c = *eptr; else
+ {
+ BACKCHAR(eptr);
+ GETCHAR(c, eptr);
+ }
+ rgb = UCD_GRAPHBREAK(c);
+
+ for (;;)
{
- if (!utf) c = *eptr; else
+ if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
+ fptr = eptr - 1;
+ if (!utf) c = *fptr; else
{
- BACKCHAR(eptr);
- GETCHAR(c, eptr);
+ BACKCHAR(fptr);
+ GETCHAR(c, fptr);
}
- if (UCD_CATEGORY(c) != ucp_M) break;
- eptr--;
+ lgb = UCD_GRAPHBREAK(c);
+ if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
+ eptr = fptr;
+ rgb = lgb;
}
}
}
|