current/SOURCES/CVE-2022-25235-2.patch

From 97cfdc3fa7dca759880d81e371901f4620279106 Mon Sep 17 00:00:00 2001
From: Sebastian Pipping <sebastian@pipping.org>
Date: Tue, 8 Feb 2022 04:06:21 +0100
Subject: [PATCH] tests: Cover missing validation of encoding (CVE-2022-25235)

---
 expat/tests/runtests.c | 109 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)

--- a/tests/runtests.c
+++ b/tests/runtests.c
@@ -5997,6 +5997,105 @@ START_TEST(test_utf8_in_cdata_section_2)
 }
 END_TEST
 
+START_TEST(test_utf8_in_start_tags) {
+  struct test_case {
+    bool goodName;       // accepted when placed after a leading 'a'
+    bool goodNameStart;  // accepted as the very first byte(s) of the tag name
+    const char *tagName; // raw bytes spliced into the start tag under test
+  };
+
+  // Checks that well-formed UTF-8 name characters are accepted in start tag
+  // names and malformed ones rejected. We cover 1-, 2- and 3-byte sequences;
+  // 4-byte sequences go to isNever and are hence not a concern.
+  //
+  // We start with a character that is a valid name character
+  // (or even name-start character, see XML 1.0r4 spec) and then we flip
+  // single bits at places where (1) the result leaves the UTF-8 encoding space
+  // and (2) we stay in the same n-byte sequence family.
+  //
+  // The flipped bits are highlighted in angle brackets in comments,
+  // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped
+  // the most significant bit to 1 to leave UTF-8 encoding space.
+  struct test_case cases[] = {
+      // 1-byte UTF-8: [0xxx xxxx]
+      {true, true, "\x3A"},   // [0011 1010] = ASCII colon ':'
+      {false, false, "\xBA"}, // [<1>011 1010]
+      {true, false, "\x39"},  // [0011 1001] = ASCII nine '9'
+      {false, false, "\xB9"}, // [<1>011 1001]
+
+      // 2-byte UTF-8: [110x xxxx] [10xx xxxx]
+      {true, true, "\xDB\xA5"},   // [1101 1011] [1010 0101] =
+                                  // Arabic small waw U+06E5
+      {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101]
+      {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101]
+      {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101]
+      {true, false, "\xCC\x81"},  // [1100 1100] [1000 0001] =
+                                  // combining char U+0301
+      {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001]
+      {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001]
+      {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001]
+
+      // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx]
+      {true, true, "\xE0\xA4\x85"},   // [1110 0000] [1010 0100] [1000 0101] =
+                                      // Devanagari Letter A U+0905
+      {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101]
+      {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101]
+      {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101]
+      {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101]
+      {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101]
+      {true, false, "\xE0\xA4\x81"},  // [1110 0000] [1010 0100] [1000 0001] =
+                                      // combining char U+0901
+      {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001]
+      {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001]
+      {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001]
+      {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001]
+      {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001]
+  };
+  const bool atNameStart[] = {true, false};
+
+  size_t i = 0;
+  char doc[1024];
+  size_t failCount = 0;
+
+  for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
+    size_t j = 0;
+    for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
+      const bool expectedSuccess
+          = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName;
+      snprintf(doc, sizeof(doc), "<%s%s><!--", atNameStart[j] ? "" : "a",
+               cases[i].tagName); // bounded: can never write past doc[]
+      XML_Parser parser = XML_ParserCreate(NULL);
+      const enum XML_Status status
+          = XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE);
+
+      bool success = true;
+      if ((status == XML_STATUS_OK) != expectedSuccess) {
+        success = false; // verdict differs from the table above
+      }
+      if ((status == XML_STATUS_ERROR)
+          && (XML_GetErrorCode(parser) != XML_ERROR_INVALID_TOKEN)) {
+        success = false; // rejected, but not as an invalid token
+      }
+
+      if (! success) {
+        fprintf(
+            stderr,
+            "FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n",
+            (unsigned)i + 1u, atNameStart[j] ? "    " : "not ",
+            (unsigned)strlen(cases[i].tagName), XML_GetErrorCode(parser));
+        failCount++; // keep going so every failing case gets reported
+      }
+
+      XML_ParserFree(parser);
+    }
+  }
+
+  if (failCount > 0) {
+    fail("UTF-8 regression detected");
+  }
+}
+END_TEST
+
 /* Test trailing spaces in elements are accepted */
 static void XMLCALL
 record_element_end_handler(void *userData, const XML_Char *name) {
@@ -6174,6 +6273,14 @@ START_TEST(test_bad_doctype) {
 }
 END_TEST
 
+START_TEST(test_bad_doctype_utf8) {
+  // [1101 1011] [<0>010 0101]; literal split so 'd' stays out of \x25
+  const char *const doc = "<!DOCTYPE \xDB\x25" "doc><doc/>";
+  expect_failure(doc, XML_ERROR_INVALID_TOKEN,
+                 "Invalid UTF-8 in DOCTYPE not faulted");
+}
+END_TEST
+
 START_TEST(test_bad_doctype_utf16) {
   const char text[] =
       /* <!DOCTYPE doc [ \x06f2 ]><doc/>
@@ -11426,6 +11533,7 @@ make_suite(void) {
   tcase_add_test(tc_basic, test_ext_entity_utf8_non_bom);
   tcase_add_test(tc_basic, test_utf8_in_cdata_section);
   tcase_add_test(tc_basic, test_utf8_in_cdata_section_2);
+  tcase_add_test(tc_basic, test_utf8_in_start_tags);
   tcase_add_test(tc_basic, test_trailing_spaces_in_elements);
   tcase_add_test(tc_basic, test_utf16_attribute);
   tcase_add_test(tc_basic, test_utf16_second_attr);
@@ -11434,6 +11542,7 @@ make_suite(void) {
   tcase_add_test(tc_basic, test_bad_attr_desc_keyword);
   tcase_add_test(tc_basic, test_bad_attr_desc_keyword_utf16);
   tcase_add_test(tc_basic, test_bad_doctype);
+  tcase_add_test(tc_basic, test_bad_doctype_utf8);
   tcase_add_test(tc_basic, test_bad_doctype_utf16);
   tcase_add_test(tc_basic, test_bad_doctype_plus);
   tcase_add_test(tc_basic, test_bad_doctype_star);
1	ns80	1790357	From 97cfdc3fa7dca759880d81e371901f4620279106 Mon Sep 17 00:00:00 2001
2			From: Sebastian Pipping <sebastian@pipping.org>
3			Date: Tue, 8 Feb 2022 04:06:21 +0100
4			Subject: [PATCH] tests: Cover missing validation of encoding (CVE-2022-25235)
5
6			---
7			expat/tests/runtests.c | 109 +++++++++++++++++++++++++++++++++++++++++
8			1 file changed, 109 insertions(+)
9
10			--- a/tests/runtests.c
11			+++ b/tests/runtests.c
12			@@ -5997,6 +5997,105 @@ START_TEST(test_utf8_in_cdata_section_2)
13			}
14			END_TEST
15
16			+START_TEST(test_utf8_in_start_tags) {
17			+ struct test_case {
18			+ bool goodName;
19			+ bool goodNameStart;
20			+ const char *tagName;
21			+ };
22			+
23			+ // The idea with the tests below is this:
24			+ // We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences
25			+ // go to isNever and are hence not a concern.
26			+ //
27			+ // We start with a character that is a valid name character
28			+ // (or even name-start character, see XML 1.0r4 spec) and then we flip
29			+ // single bits at places where (1) the result leaves the UTF-8 encoding space
30			+ // and (2) we stay in the same n-byte sequence family.
31			+ //
32			+ // The flipped bits are highlighted in angle brackets in comments,
33			+ // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped
34			+ // the most significant bit to 1 to leave UTF-8 encoding space.
35			+ struct test_case cases[] = {
36			+ // 1-byte UTF-8: [0xxx xxxx]
37			+ {true, true, "\x3A"}, // [0011 1010] = ASCII colon ':'
38			+ {false, false, "\xBA"}, // [<1>011 1010]
39			+ {true, false, "\x39"}, // [0011 1001] = ASCII nine '9'
40			+ {false, false, "\xB9"}, // [<1>011 1001]
41			+
42			+ // 2-byte UTF-8: [110x xxxx] [10xx xxxx]
43			+ {true, true, "\xDB\xA5"}, // [1101 1011] [1010 0101] =
44			+ // Arabic small waw U+06E5
45			+ {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101]
46			+ {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101]
47			+ {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101]
48			+ {true, false, "\xCC\x81"}, // [1100 1100] [1000 0001] =
49			+ // combining char U+0301
50			+ {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001]
51			+ {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001]
52			+ {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001]
53			+
54			+ // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx]
55			+ {true, true, "\xE0\xA4\x85"}, // [1110 0000] [1010 0100] [1000 0101] =
56			+ // Devanagari Letter A U+0905
57			+ {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101]
58			+ {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101]
59			+ {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101]
60			+ {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101]
61			+ {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101]
62			+ {true, false, "\xE0\xA4\x81"}, // [1110 0000] [1010 0100] [1000 0001] =
63			+ // combining char U+0901
64			+ {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001]
65			+ {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001]
66			+ {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001]
67			+ {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001]
68			+ {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001]
69			+ };
70			+ const bool atNameStart[] = {true, false};
71			+
72			+ size_t i = 0;
73			+ char doc[1024];
74			+ size_t failCount = 0;
75			+
76			+ for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
77			+ size_t j = 0;
78			+ for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
79			+ const bool expectedSuccess
80			+ = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName;
81			+ sprintf(doc, "<%s%s><!--", atNameStart[j] ? "" : "a", cases[i].tagName);
82			+ XML_Parser parser = XML_ParserCreate(NULL);
83			+
84			+ const enum XML_Status status
85			+ = XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE);
86			+
87			+ bool success = true;
88			+ if ((status == XML_STATUS_OK) != expectedSuccess) {
89			+ success = false;
90			+ }
91			+ if ((status == XML_STATUS_ERROR)
92			+ && (XML_GetErrorCode(parser) != XML_ERROR_INVALID_TOKEN)) {
93			+ success = false;
94			+ }
95			+
96			+ if (! success) {
97			+ fprintf(
98			+ stderr,
99			+ "FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n",
100			+ (unsigned)i + 1u, atNameStart[j] ? "    " : "not ",
101			+ (unsigned)strlen(cases[i].tagName), XML_GetErrorCode(parser));
102			+ failCount++;
103			+ }
104			+
105			+ XML_ParserFree(parser);
106			+ }
107			+ }
108			+
109			+ if (failCount > 0) {
110			+ fail("UTF-8 regression detected");
111			+ }
112			+}
113			+END_TEST
114			+
115			/* Test trailing spaces in elements are accepted */
116			static void XMLCALL
117			record_element_end_handler(void *userData, const XML_Char *name) {
118			@@ -6174,6 +6273,14 @@ START_TEST(test_bad_doctype) {
119			}
120			END_TEST
121
122			+START_TEST(test_bad_doctype_utf8) {
123			+ const char *text = "<!DOCTYPE \xDB\x25"
124			+ "doc><doc/>"; // [1101 1011] [<0>010 0101]
125			+ expect_failure(text, XML_ERROR_INVALID_TOKEN,
126			+ "Invalid UTF-8 in DOCTYPE not faulted");
127			+}
128			+END_TEST
129			+
130			START_TEST(test_bad_doctype_utf16) {
131			const char text[] =
132			/* <!DOCTYPE doc [ \x06f2 ]><doc/>
133			@@ -11426,6 +11533,7 @@ make_suite(void) {
134			tcase_add_test(tc_basic, test_ext_entity_utf8_non_bom);
135			tcase_add_test(tc_basic, test_utf8_in_cdata_section);
136			tcase_add_test(tc_basic, test_utf8_in_cdata_section_2);
137			+ tcase_add_test(tc_basic, test_utf8_in_start_tags);
138			tcase_add_test(tc_basic, test_trailing_spaces_in_elements);
139			tcase_add_test(tc_basic, test_utf16_attribute);
140			tcase_add_test(tc_basic, test_utf16_second_attr);
141			@@ -11434,6 +11542,7 @@ make_suite(void) {
142			tcase_add_test(tc_basic, test_bad_attr_desc_keyword);
143			tcase_add_test(tc_basic, test_bad_attr_desc_keyword_utf16);
144			tcase_add_test(tc_basic, test_bad_doctype);
145			+ tcase_add_test(tc_basic, test_bad_doctype_utf8);
146			tcase_add_test(tc_basic, test_bad_doctype_utf16);
147			tcase_add_test(tc_basic, test_bad_doctype_plus);
148			tcase_add_test(tc_basic, test_bad_doctype_star);