PERL PERL5 CHANGES 14 CHANGE 23261 INTEGRATE
Date: Sat, 4 Sep 2004 14:00:00 -0700

Subject: Change 23261: Integrate:
From: nick@no-spam (Nicholas Clark)

Change 23261 by nicholas@no-spam on 2004/09/04 20:30:30

Integrate:
[ 23074]
Subject: Re: Segfault using HTML::Entities
From: Jarkko Hietaniemi <jhi@no-spam>
Message-ID: <40EDBE1A.6080205@no-spam>
Date: Fri, 09 Jul 2004 00:35:22 +0300

Affected files ...

... //depot/maint-5.8/perl/pp_ctl.c#58 integrate
... //depot/maint-5.8/perl/regexec.c#30 integrate
... //depot/maint-5.8/perl/t/run/fresh_perl.t#10 integrate
Differences ...

==== //depot/maint-5.8/perl/pp_ctl.c#58 (text) ====
Index: perl/pp_ctl.c
--- perl/pp_ctl.c#57~22980~ Wed Jun 23 05:44:49 2004
+++ perl/pp_ctl.c Sat Sep 4 13:30:30 2004
@@ -187,10 +187,13 @@
{
SV *targ = cx->sb_targ;
- if (DO_UTF8(dstr) && !SvUTF8(targ))
- sv_catpvn_utf8_upgrade(dstr, s, cx->sb_strend - s, nsv);
- else
- sv_catpvn(dstr, s, cx->sb_strend - s);
+ assert(cx->sb_strend >= s);
+ if(cx->sb_strend > s) {
+ if (DO_UTF8(dstr) && !SvUTF8(targ))
+ sv_catpvn_utf8_upgrade(dstr, s, cx->sb_strend - s, nsv);
+ else
+ sv_catpvn(dstr, s, cx->sb_strend - s);
+ }
cx->sb_rxtainted |= RX_MATCH_TAINTED(rx);
(void)SvOOK_off(targ);

==== //depot/maint-5.8/perl/regexec.c#30 (text) ====
Index: perl/regexec.c
--- perl/regexec.c#29~22784~ Wed May 5 14:43:32 2004
+++ perl/regexec.c Sat Sep 4 13:30:30 2004
@@ -954,6 +954,7 @@
char *m;
STRLEN ln;
STRLEN lnc;
+ register STRLEN uskip;
unsigned int c1;
unsigned int c2;
char *e;
@@ -964,7 +965,7 @@
switch (OP(c)) {
case ANYOF:
if (do_utf8) {
- while (s < strend) {
+ while (s + (uskip = UTF8SKIP(s)) <= strend) {
if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
!UTF8_IS_INVARIANT((U8)s[0]) ?
reginclass(c, (U8*)s, 0, do_utf8) :
@@ -976,7 +977,7 @@
}
else tmp = 1;
- s += UTF8SKIP(s);
+ s += uskip;
}
}
else {
@@ -1172,7 +1173,7 @@
tmp = ((OP(c) == BOUND ?
isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
LOAD_UTF8_CHARCLASS(alnum,"a");
- while (s < strend) {
+ while (s + (uskip = UTF8SKIP(s)) <= strend) {
if (tmp == !(OP(c) == BOUND ?
swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
isALNUM_LC_utf8((U8*)s)))
@@ -1181,7 +1182,7 @@
if ((norun || regtry(prog, s)))
goto got_it;
}
- s += UTF8SKIP(s);
+ s += uskip;
}
}
else {
@@ -1215,14 +1216,14 @@
tmp = ((OP(c) == NBOUND ?
isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
LOAD_UTF8_CHARCLASS(alnum,"a");
- while (s < strend) {
+ while (s + (uskip = UTF8SKIP(s)) <= strend) {
if (tmp == !(OP(c) == NBOUND ?
swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
isALNUM_LC_utf8((U8*)s)))
tmp = !tmp;
else if ((norun || regtry(prog, s)))
goto got_it;
- s += UTF8SKIP(s);
+ s += uskip;
}
}
else {
@@ -1244,7 +1245,7 @@
case ALNUM:
if (do_utf8) {
LOAD_UTF8_CHARCLASS(alnum,"a");
- while (s < strend) {
+ while (s + (uskip = UTF8SKIP(s)) <= strend) {
if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
@@ -1253,7 +1254,7 @@
}
else tmp = 1;
- s += UTF8SKIP(s);
+ s += uskip;
}
}
else {
@@ -1273,7 +1274,7 @@
case ALNUML:
PL_reg_flags |= RF_tainted;
if (do_utf8) {
- while (s < strend) {
+ while (s + (uskip = UTF8SKIP(s)) <= strend) {
if (isALNUM_LC_utf8((U8*)s)) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
@@ -1282,7 +1283,7 @@
}
else tmp = 1;
- s += UTF8SKIP(s);
+ s += uskip;
}
}
else {
@@ -1302,7 +1303,7 @@
case NALNUM:
if (do_utf8) {
LOAD_UTF8_CHARCLASS(alnum,"a");
- while (s < strend) {
+ while (s + (uskip = UTF8SKIP(s)) <= strend) {
if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
@@ -1311,7 +1312,7 @@
}
else tmp = 1;
- s += UTF8SKIP(s);
+ s += uskip;
}
}
else {
@@ -1331,7 +1332,7 @@
case NALNUML:
PL_reg_flags |= RF_tainted;
if (do_utf8) {
- while (s < strend) {
+ while (s + (uskip = UTF8SKIP(s)) <= strend) {
if (!isALNUM_LC_utf8((U8*)s)) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
@@ -1340,7 +1341,7 @@
}
else tmp = 1;
- s += UTF8SKIP(s);
+ s += uskip;
}
}
else {
@@ -1360,7 +1361,7 @@
case SPACE:
if (do_utf8) {
LOAD_UTF8_CHARCLASS(space," ");
- while (s < strend) {
+ while (s + (uskip = UTF8SKIP(s)) <= strend) {
if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
@@ -1369,7 +1370,7 @@
}
else tmp = 1;
- s += UTF8SKIP(s);
+ s += uskip;
}
}
else {
@@ -1389,7 +1390,7 @@
case SPACEL:
PL_reg_flags |= RF_tainted;
if (do_utf8) {
- while (s < strend) {
+ while (s + (uskip = UTF8SKIP(s)) <= strend) {
if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
@@ -1398,7 +1399,7 @@
}
else tmp = 1;
- s += UTF8SKIP(s);
+ s += uskip;
}
}
else {
@@ -1418,7 +1419,7 @@
case NSPACE:
if (do_utf8) {
LOAD_UTF8_CHARCLASS(space," ");
- while (s < strend) {
+ while (s + (uskip = UTF8SKIP(s)) <= strend) {
if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
@@ -1427,7 +1428,7 @@
}
else tmp = 1;
- s += UTF8SKIP(s);
+ s += uskip;
}
}
else {
@@ -1447,7 +1448,7 @@
case NSPACEL:
PL_reg_flags |= RF_tainted;
if (do_utf8) {
- while (s < strend) {
+ while (s + (uskip = UTF8SKIP(s)) <= strend) {
if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
@@ -1456,7 +1457,7 @@
}
else tmp = 1;
- s += UTF8SKIP(s);
+ s += uskip;
}
}
else {
@@ -1476,7 +1477,7 @@
case DIGIT:
if (do_utf8) {
LOAD_UTF8_CHARCLASS(digit,"0");
- while (s < strend) {
+ while (s + (uskip = UTF8SKIP(s)) <= strend) {
if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
@@ -1485,7 +1486,7 @@
}
else tmp = 1;
- s += UTF8SKIP(s);
+ s += uskip;
}
}
else {
@@ -1505,7 +1506,7 @@
case DIGITL:
PL_reg_flags |= RF_tainted;
if (do_utf8) {
- while (s < strend) {
+ while (s + (uskip = UTF8SKIP(s)) <= strend) {
if (isDIGIT_LC_utf8((U8*)s)) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
@@ -1514,7 +1515,7 @@
}
else tmp = 1;
- s += UTF8SKIP(s);
+ s += uskip;
}
}
else {
@@ -1534,7 +1535,7 @@
case NDIGIT:
if (do_utf8) {
LOAD_UTF8_CHARCLASS(digit,"0");
- while (s < strend) {
+ while (s + (uskip = UTF8SKIP(s)) <= strend) {
if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
@@ -1543,7 +1544,7 @@
}
else tmp = 1;
- s += UTF8SKIP(s);
+ s += uskip;
}
}
else {
@@ -1563,7 +1564,7 @@
case NDIGITL:
PL_reg_flags |= RF_tainted;
if (do_utf8) {
- while (s < strend) {
+ while (s + (uskip = UTF8SKIP(s)) <= strend) {
if (!isDIGIT_LC_utf8((U8*)s)) {
if (tmp && (norun || regtry(prog, s)))
goto got_it;
@@ -1572,7 +1573,7 @@
}
else tmp = 1;
- s += UTF8SKIP(s);
+ s += uskip;
}
}
else {

==== //depot/maint-5.8/perl/t/run/fresh_perl.t#10 (text) ====
Index: perl/t/run/fresh_perl.t
--- perl/t/run/fresh_perl.t#9~21211~ Sat Sep 13 09:10:30 2003
+++ perl/t/run/fresh_perl.t Sat Sep 4 13:30:30 2004
@@ -855,3 +855,19 @@
EXPECT
./"TEST"
./"TEST"
+######## "Segfault using HTML::Entities", Richard Jolly <richardjolly@no-spam>, <A3C7D27E-C9F4-11D8-B294-003065AE00B6@no-spam> in perl-unicode@no-spam
+-lw
+BEGIN {
+ eval 'require Encode';
+ if ($@) { exit 0 } # running minitest?
+}
+# Test case cut down by jhi
+$SIG{__WARN__} = sub { $@ = shift };
+use Encode;
+my $t = "\xE9";
+Encode::_utf8_on($t);
+$t =~ s/([^a])//ge;
+$@ =~ s/ at .*/ at/;
+print $@
+EXPECT
+Malformed UTF-8 character (unexpected end of string) at
End of Patch.