From 876c834068637115bf028de9fc242afd1ed7e53a Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Wed, 16 Sep 2015 14:34:31 -0600
Subject: PATCH [perl #123562] Regexp-matching "hangs"

The regex engine got into an infinite loop because of the malformation.
It is trying to back-up over a sequence of UTF-8 continuation bytes.
But the character just before the sequence should be a start byte.  If
not, there is a malformation.  I added a test to croak if that isn't the
case so that it doesn't just infinitely loop.  I did this also in the
similar areas of regexec.c.

Comments long ago added to the code suggested that we check for
malformations in the vicinity of the new tests.  But that was never
done.  These new tests should be good enough to prevent looping, anyway.

(cherry picked from commit 22b433eff9a1ffa2454e18405a56650f07b385b5)

Bug: https://rt.perl.org/Ticket/Display.html?id=123562
Bug-Debian: https://bugs.debian.org/821848
Origin: http://perl5.git.perl.org/perl.git/shortlog/refs/heads/smoke-me/rt_123562_520
Patch-Name: fixes/CVE-2015-8853_regexp_hang.diff
---
 regexec.c  | 12 ++++++++++++
 t/re/pat.t | 19 ++++++++++++++++++-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/regexec.c b/regexec.c
index 66f6e04966..ee6705aebc 100644
--- a/regexec.c
+++ b/regexec.c
@@ -7830,6 +7830,10 @@ S_reghop3(U8 *s, SSize_t off, const U8* lim)
             if (UTF8_IS_CONTINUED(*s)) {
                 while (s > lim && UTF8_IS_CONTINUATION(*s))
                     s--;
+                if (! UTF8_IS_START(*s)) {
+                    dTHX;
+                    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
+                }
 	    }
             /* XXX could check well-formedness here */
 	}
@@ -7856,6 +7860,10 @@ S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
             if (UTF8_IS_CONTINUED(*s)) {
                 while (s > llim && UTF8_IS_CONTINUATION(*s))
                     s--;
+                if (! UTF8_IS_START(*s)) {
+                    dTHX;
+                    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
+                }
             }
             /* XXX could check well-formedness here */
         }
@@ -7887,6 +7895,10 @@ S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
             if (UTF8_IS_CONTINUED(*s)) {
                 while (s > lim && UTF8_IS_CONTINUATION(*s))
                     s--;
+                if (! UTF8_IS_START(*s)) {
+                    dTHX;
+                    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
+                }
 	    }
             /* XXX could check well-formedness here */
 	}
diff --git a/t/re/pat.t b/t/re/pat.t
index 7965f4e67d..6e694d7436 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -20,7 +20,7 @@ BEGIN {
     require './test.pl';
 }
 
-plan tests => 726;  # Update this when adding/deleting tests.
+plan tests => 727;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1602,6 +1602,23 @@ EOP
 		ok(1, "did not crash");
 		ok($match, "[bbb...] resolved as character class, not subscript");
 	}
+
+        {   # Test that we handle some malformed UTF-8 without looping [perl
+            # #123562]
+
+            my $code='
+                BEGIN{require q(test.pl);}
+                use Encode qw(_utf8_on);
+                my $malformed = "a\x80\n";
+                _utf8_on($malformed);
+                watchdog(3);
+                $malformed =~ /(\n\r|\r)$/;
+                print q(No infinite loop here!);
+            ';
+            fresh_perl_like($code, qr/Malformed UTF-8 character/, {},
+                "test that we handle some UTF-8 malformations without looping" );
+        }
+
 } # End of sub run_tests
 
 1;
