1 |
From 97cfdc3fa7dca759880d81e371901f4620279106 Mon Sep 17 00:00:00 2001 |
2 |
From: Sebastian Pipping <sebastian@pipping.org> |
3 |
Date: Tue, 8 Feb 2022 04:06:21 +0100 |
4 |
Subject: [PATCH] tests: Cover missing validation of encoding (CVE-2022-25235) |
5 |
|
6 |
--- |
7 |
expat/tests/runtests.c | 109 +++++++++++++++++++++++++++++++++++++++++ |
8 |
1 file changed, 109 insertions(+) |
9 |
|
10 |
--- a/tests/runtests.c |
11 |
+++ b/tests/runtests.c |
12 |
@@ -5997,6 +5997,105 @@ START_TEST(test_utf8_in_cdata_section_2) |
13 |
} |
14 |
END_TEST |
15 |
|
16 |
+/* Test UTF-8 validation of start tag names (CVE-2022-25235) */ |
17 |
+START_TEST(test_utf8_in_start_tags) { |
18 |
+  struct test_case { |
19 |
+    bool goodName; |
20 |
+    bool goodNameStart; |
21 |
+    const char *tagName; |
22 |
+  }; |
23 |
+ |
24 |
+  // The idea with the tests below is this: |
25 |
+  // We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences |
26 |
+  // go to isNever and are hence not a concern. |
27 |
+  // |
28 |
+  // We start with a character that is a valid name character |
29 |
+  // (or even name-start character, see XML 1.0r4 spec) and then we flip |
30 |
+  // single bits at places where (1) the result leaves the UTF-8 encoding space |
31 |
+  // and (2) we stay in the same n-byte sequence family. |
32 |
+  // |
33 |
+  // The flipped bits are highlighted in angle brackets in comments, |
34 |
+  // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped |
35 |
+  // the most significant bit to 1 to leave UTF-8 encoding space. |
36 |
+  struct test_case cases[] = { |
37 |
+      // 1-byte UTF-8: [0xxx xxxx] |
38 |
+      {true, true, "\x3A"},   // [0011 1010] = ASCII colon ':' |
39 |
+      {false, false, "\xBA"}, // [<1>011 1010] |
40 |
+      {true, false, "\x39"},  // [0011 1001] = ASCII nine '9' |
41 |
+      {false, false, "\xB9"}, // [<1>011 1001] |
42 |
+ |
43 |
+      // 2-byte UTF-8: [110x xxxx] [10xx xxxx] |
44 |
+      {true, true, "\xDB\xA5"},   // [1101 1011] [1010 0101] = |
45 |
+                                  // Arabic small waw U+06E5 |
46 |
+      {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101] |
47 |
+      {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101] |
48 |
+      {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101] |
49 |
+      {true, false, "\xCC\x81"},  // [1100 1100] [1000 0001] = |
50 |
+                                  // combining char U+0301 |
51 |
+      {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001] |
52 |
+      {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001] |
53 |
+      {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001] |
54 |
+ |
55 |
+      // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx] |
56 |
+      {true, true, "\xE0\xA4\x85"},   // [1110 0000] [1010 0100] [1000 0101] = |
57 |
+                                      // Devanagari Letter A U+0905 |
58 |
+      {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101] |
59 |
+      {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101] |
60 |
+      {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101] |
61 |
+      {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101] |
62 |
+      {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101] |
63 |
+      {true, false, "\xE0\xA4\x81"},  // [1110 0000] [1010 0100] [1000 0001] = |
64 |
+                                      // combining char U+0901 |
65 |
+      {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001] |
66 |
+      {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001] |
67 |
+      {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001] |
68 |
+      {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001] |
69 |
+      {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001] |
70 |
+  }; |
71 |
+  const bool atNameStart[] = {true, false}; |
72 |
+  size_t i = 0; |
73 |
+  char doc[1024]; |
74 |
+  size_t failCount = 0; |
75 |
+ |
76 |
+  for (; i < sizeof(cases) / sizeof(cases[0]); i++) { |
77 |
+    size_t j = 0; |
78 |
+    for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) { |
79 |
+      const bool expectedSuccess |
80 |
+          = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName; |
81 |
+      // Put the sequence first in the name, or second after an ASCII 'a' |
82 |
+      snprintf(doc, sizeof(doc), "<%s%s><!--", atNameStart[j] ? "" : "a", |
83 |
+               cases[i].tagName); |
84 |
+      XML_Parser parser = XML_ParserCreate(NULL); |
85 |
+      const enum XML_Status status |
86 |
+          = XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE); |
87 |
+ |
88 |
+      bool success = true; |
89 |
+      if ((status == XML_STATUS_OK) != expectedSuccess) { |
90 |
+        success = false; |
91 |
+      } |
92 |
+      // Rejection for any reason but an invalid token is a failure, too |
93 |
+      if ((status == XML_STATUS_ERROR) |
94 |
+          && (XML_GetErrorCode(parser) != XML_ERROR_INVALID_TOKEN)) { |
95 |
+        success = false; |
96 |
+      } |
97 |
+      if (! success) { |
98 |
+        fprintf( |
99 |
+            stderr, |
100 |
+            "FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n", |
101 |
+            (unsigned)i + 1u, atNameStart[j] ? " " : "not ", |
102 |
+            (unsigned)strlen(cases[i].tagName), XML_GetErrorCode(parser)); |
103 |
+        failCount++; |
104 |
+      } |
105 |
+      XML_ParserFree(parser); |
106 |
+    } |
107 |
+  } |
108 |
+ |
109 |
+  if (failCount > 0) { |
110 |
+    fail("UTF-8 regression detected"); |
111 |
+  } |
112 |
+} |
113 |
+END_TEST |
114 |
+ |
115 |
/* Test trailing spaces in elements are accepted */ |
116 |
static void XMLCALL |
117 |
record_element_end_handler(void *userData, const XML_Char *name) { |
118 |
@@ -6174,6 +6273,14 @@ START_TEST(test_bad_doctype) { |
119 |
} |
120 |
END_TEST |
121 |
|
122 |
+START_TEST(test_bad_doctype_utf8) { |
123 |
+  // [1101 1011] [<0>010 0101]; split so 'd' cannot extend the \x25 escape |
124 |
+  const char *const brokenDoctype = "<!DOCTYPE \xDB\x25" "doc><doc/>"; |
125 |
+  expect_failure(brokenDoctype, XML_ERROR_INVALID_TOKEN, |
126 |
+                 "Invalid UTF-8 in DOCTYPE not faulted"); |
127 |
+} |
128 |
+END_TEST |
129 |
+ |
130 |
START_TEST(test_bad_doctype_utf16) { |
131 |
const char text[] = |
132 |
/* <!DOCTYPE doc [ \x06f2 ]><doc/> |
133 |
@@ -11426,6 +11533,7 @@ make_suite(void) { |
134 |
tcase_add_test(tc_basic, test_ext_entity_utf8_non_bom); |
135 |
tcase_add_test(tc_basic, test_utf8_in_cdata_section); |
136 |
tcase_add_test(tc_basic, test_utf8_in_cdata_section_2); |
137 |
+ tcase_add_test(tc_basic, test_utf8_in_start_tags); |
138 |
tcase_add_test(tc_basic, test_trailing_spaces_in_elements); |
139 |
tcase_add_test(tc_basic, test_utf16_attribute); |
140 |
tcase_add_test(tc_basic, test_utf16_second_attr); |
141 |
@@ -11434,6 +11542,7 @@ make_suite(void) { |
142 |
tcase_add_test(tc_basic, test_bad_attr_desc_keyword); |
143 |
tcase_add_test(tc_basic, test_bad_attr_desc_keyword_utf16); |
144 |
tcase_add_test(tc_basic, test_bad_doctype); |
145 |
+ tcase_add_test(tc_basic, test_bad_doctype_utf8); |
146 |
tcase_add_test(tc_basic, test_bad_doctype_utf16); |
147 |
tcase_add_test(tc_basic, test_bad_doctype_plus); |
148 |
tcase_add_test(tc_basic, test_bad_doctype_star); |