1 |
ns80 |
1790357 |
From 97cfdc3fa7dca759880d81e371901f4620279106 Mon Sep 17 00:00:00 2001 |
2 |
|
|
From: Sebastian Pipping <sebastian@pipping.org> |
3 |
|
|
Date: Tue, 8 Feb 2022 04:06:21 +0100 |
4 |
|
|
Subject: [PATCH] tests: Cover missing validation of encoding (CVE-2022-25235) |
5 |
|
|
|
6 |
|
|
--- |
7 |
|
|
expat/tests/runtests.c | 109 +++++++++++++++++++++++++++++++++++++++++ |
8 |
|
|
1 file changed, 109 insertions(+) |
9 |
|
|
|
10 |
|
|
--- a/tests/runtests.c |
11 |
|
|
+++ b/tests/runtests.c |
12 |
|
|
@@ -5997,6 +5997,105 @@ START_TEST(test_utf8_in_cdata_section_2) |
13 |
|
|
} |
14 |
|
|
END_TEST |
15 |
|
|
|
16 |
|
|
+/* Regression test for CVE-2022-25235: checks how the parser treats valid and
+ * malformed UTF-8 byte sequences inside a start-tag name.
+ *
+ * Every table entry is tried twice: once as the first character of the tag
+ * name ("<X>") and once right after an ASCII 'a' ("<aX>").  Each document is
+ * followed by an unterminated "<!--" and is fed as a non-final chunk.  A
+ * combination passes when XML_Parse() succeeds exactly when the table says it
+ * should, and any reported error is XML_ERROR_INVALID_TOKEN.  Every failing
+ * combination is written to stderr; the test fails once at the end if there
+ * was at least one. */
+START_TEST(test_utf8_in_start_tags) {
+  struct test_case {
+    bool goodName;       // expected parse success when NOT at name start
+    bool goodNameStart;  // expected parse success when at name start
+    const char *tagName; // raw bytes spliced into the tag name
+  };
+
+  // The idea with the tests below is this:
+  // We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences
+  // go to isNever and are hence not a concern.
+  //
+  // We start with a character that is a valid name character
+  // (or even name-start character, see XML 1.0r4 spec) and then we flip
+  // single bits at places where (1) the result leaves the UTF-8 encoding space
+  // and (2) we stay in the same n-byte sequence family.
+  //
+  // The flipped bits are highlighted in angle brackets in comments,
+  // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped
+  // the most significant bit to 1 to leave UTF-8 encoding space.
+  const struct test_case cases[] = {
+      // 1-byte UTF-8: [0xxx xxxx]
+      {true, true, "\x3A"},   // [0011 1010] = ASCII colon ':'
+      {false, false, "\xBA"}, // [<1>011 1010]
+      {true, false, "\x39"},  // [0011 1001] = ASCII nine '9'
+      {false, false, "\xB9"}, // [<1>011 1001]
+
+      // 2-byte UTF-8: [110x xxxx] [10xx xxxx]
+      {true, true, "\xDB\xA5"},   // [1101 1011] [1010 0101] =
+                                  // Arabic small waw U+06E5
+      {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101]
+      {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101]
+      {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101]
+      {true, false, "\xCC\x81"},  // [1100 1100] [1000 0001] =
+                                  // combining char U+0301
+      {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001]
+      {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001]
+      {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001]
+
+      // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx]
+      {true, true, "\xE0\xA4\x85"},   // [1110 0000] [1010 0100] [1000 0101] =
+                                      // Devanagari Letter A U+0905
+      {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101]
+      {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101]
+      {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101]
+      {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101]
+      {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101]
+      {true, false, "\xE0\xA4\x81"},  // [1110 0000] [1010 0100] [1000 0001] =
+                                      // combining char U+0901
+      {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001]
+      {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001]
+      {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001]
+      {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001]
+      {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001]
+  };
+  const bool atNameStart[] = {true, false};
+  const size_t caseCount = sizeof(cases) / sizeof(cases[0]);
+  const size_t positionCount = sizeof(atNameStart) / sizeof(atNameStart[0]);
+
+  char doc[1024];
+  size_t failCount = 0;
+
+  for (size_t i = 0; i < caseCount; i++) {
+    for (size_t j = 0; j < positionCount; j++) {
+      const bool expectedSuccess
+          = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName;
+
+      // Bounded formatting: the write can never run past the end of doc,
+      // whatever tag names are added to the table later.
+      snprintf(doc, sizeof(doc), "<%s%s><!--", atNameStart[j] ? "" : "a",
+               cases[i].tagName);
+      XML_Parser parser = XML_ParserCreate(NULL);
+
+      const enum XML_Status status
+          = XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE);
+      // Read the error code once; converting to int keeps the "%d" argument
+      // below well-typed regardless of the enum's underlying type.
+      const int errorCode = (int)XML_GetErrorCode(parser);
+
+      // The outcome must match the table ...
+      bool success = ((status == XML_STATUS_OK) == expectedSuccess);
+      // ... and a rejection must be reported as an invalid token.
+      if ((status == XML_STATUS_ERROR)
+          && (errorCode != (int)XML_ERROR_INVALID_TOKEN)) {
+        success = false;
+      }
+
+      if (! success) {
+        fprintf(
+            stderr,
+            "FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n",
+            (unsigned)i + 1u, atNameStart[j] ? " " : "not ",
+            (unsigned)strlen(cases[i].tagName), errorCode);
+        failCount++;
+      }
+
+      XML_ParserFree(parser);
+    }
+  }
+
+  // Report once at the end so that every failing combination gets listed.
+  if (failCount > 0) {
+    fail("UTF-8 regression detected");
+  }
+}
+END_TEST
114 |
|
|
+ |
115 |
|
|
/* Test trailing spaces in elements are accepted */ |
116 |
|
|
static void XMLCALL |
117 |
|
|
record_element_end_handler(void *userData, const XML_Char *name) { |
118 |
|
|
@@ -6174,6 +6273,14 @@ START_TEST(test_bad_doctype) { |
119 |
|
|
} |
120 |
|
|
END_TEST |
121 |
|
|
|
122 |
|
|
+/* Expects a DOCTYPE whose name begins with a malformed 2-byte UTF-8 sequence
+ * to fail with XML_ERROR_INVALID_TOKEN. */
+START_TEST(test_bad_doctype_utf8) {
+  // Bytes 0xDB 0x25 = [1101 1011] [<0>010 0101]: the top bit of the second
+  // byte has been cleared compared with a [10xx xxxx] continuation byte.
+  // The literal is split right after "\x25" so that the 'd' of "doc" is not
+  // taken as one more hexadecimal digit of the escape sequence.
+  const char *malformedDoctype = "<!DOCTYPE \xDB\x25"
+                                 "doc><doc/>";
+
+  expect_failure(malformedDoctype, XML_ERROR_INVALID_TOKEN,
+                 "Invalid UTF-8 in DOCTYPE not faulted");
+}
+END_TEST
129 |
|
|
+ |
130 |
|
|
START_TEST(test_bad_doctype_utf16) { |
131 |
|
|
const char text[] = |
132 |
|
|
/* <!DOCTYPE doc [ \x06f2 ]><doc/> |
133 |
|
|
@@ -11426,6 +11533,7 @@ make_suite(void) { |
134 |
|
|
tcase_add_test(tc_basic, test_ext_entity_utf8_non_bom); |
135 |
|
|
tcase_add_test(tc_basic, test_utf8_in_cdata_section); |
136 |
|
|
tcase_add_test(tc_basic, test_utf8_in_cdata_section_2); |
137 |
|
|
+ tcase_add_test(tc_basic, test_utf8_in_start_tags); |
138 |
|
|
tcase_add_test(tc_basic, test_trailing_spaces_in_elements); |
139 |
|
|
tcase_add_test(tc_basic, test_utf16_attribute); |
140 |
|
|
tcase_add_test(tc_basic, test_utf16_second_attr); |
141 |
|
|
@@ -11434,6 +11542,7 @@ make_suite(void) { |
142 |
|
|
tcase_add_test(tc_basic, test_bad_attr_desc_keyword); |
143 |
|
|
tcase_add_test(tc_basic, test_bad_attr_desc_keyword_utf16); |
144 |
|
|
tcase_add_test(tc_basic, test_bad_doctype); |
145 |
|
|
+ tcase_add_test(tc_basic, test_bad_doctype_utf8); |
146 |
|
|
tcase_add_test(tc_basic, test_bad_doctype_utf16); |
147 |
|
|
tcase_add_test(tc_basic, test_bad_doctype_plus); |
148 |
|
|
tcase_add_test(tc_basic, test_bad_doctype_star); |