1 |
--- src/search.c.orig |
2 |
+++ src/search.c |
3 |
@@ -18,10 +18,15 @@ |
4 |
|
5 |
/* Written August 1992 by Mike Haertel. */ |
6 |
|
7 |
+#ifndef _GNU_SOURCE |
8 |
+# define _GNU_SOURCE 1 |
9 |
+#endif |
10 |
#ifdef HAVE_CONFIG_H |
11 |
# include <config.h> |
12 |
#endif |
13 |
|
14 |
+#include <assert.h> |
15 |
+ |
16 |
#include <sys/types.h> |
17 |
|
18 |
#include "mbsupport.h" |
19 |
@@ -43,6 +48,9 @@ |
20 |
#ifdef HAVE_LIBPCRE |
21 |
# include <pcre.h> |
22 |
#endif |
23 |
+#ifdef HAVE_LANGINFO_CODESET |
24 |
+# include <langinfo.h> |
25 |
+#endif |
26 |
|
27 |
#define NCHAR (UCHAR_MAX + 1) |
28 |
|
29 |
@@ -68,6 +76,19 @@ |
30 |
error (2, 0, _("memory exhausted")); |
31 |
} |
32 |
|
33 |
+/* Nonzero once check_utf8 has seen a locale codeset of "UTF-8". UTF-8 encoding allows some optimizations that we can't otherwise |
34 |
+ assume in a multibyte encoding; the matchers in this file test it to skip their mbrlen rescans. */ |
35 |
+static int using_utf8; |
36 |
+/* Set using_utf8 when nl_langinfo (CODESET) reports "UTF-8"; does nothing when HAVE_LANGINFO_CODESET is undefined. */ |
37 |
+void |
38 |
+check_utf8 (void) |
39 |
+{ |
40 |
+#ifdef HAVE_LANGINFO_CODESET |
41 |
+ if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0) |
42 |
+ using_utf8 = 1; /* this function only ever sets the flag; it never clears it */ |
43 |
+#endif |
44 |
+} |
45 |
+ |
46 |
#ifndef FGREP_PROGRAM |
47 |
/* DFA compiled regexp. */ |
48 |
static struct dfa dfa; |
49 |
@@ -134,49 +155,6 @@ |
50 |
} |
51 |
#endif /* !FGREP_PROGRAM */ |
52 |
|
53 |
-#ifdef MBS_SUPPORT |
54 |
-/* This function allocate the array which correspond to "buf". |
55 |
- Then this check multibyte string and mark on the positions which |
56 |
- are not single byte character nor the first byte of a multibyte |
57 |
- character. Caller must free the array. */ |
58 |
-static char* |
59 |
-check_multibyte_string(char const *buf, size_t size) |
60 |
-{ |
61 |
- char *mb_properties = xmalloc(size); |
62 |
- mbstate_t cur_state; |
63 |
- wchar_t wc; |
64 |
- int i; |
65 |
- |
66 |
- memset(&cur_state, 0, sizeof(mbstate_t)); |
67 |
- memset(mb_properties, 0, sizeof(char)*size); |
68 |
- |
69 |
- for (i = 0; i < size ;) |
70 |
- { |
71 |
- size_t mbclen; |
72 |
- mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state); |
73 |
- |
74 |
- if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) |
75 |
- { |
76 |
- /* An invalid sequence, or a truncated multibyte character. |
77 |
- We treat it as a single byte character. */ |
78 |
- mbclen = 1; |
79 |
- } |
80 |
- else if (match_icase) |
81 |
- { |
82 |
- if (iswupper((wint_t)wc)) |
83 |
- { |
84 |
- wc = towlower((wint_t)wc); |
85 |
- wcrtomb(buf + i, wc, &cur_state); |
86 |
- } |
87 |
- } |
88 |
- mb_properties[i] = mbclen; |
89 |
- i += mbclen; |
90 |
- } |
91 |
- |
92 |
- return mb_properties; |
93 |
-} |
94 |
-#endif /* MBS_SUPPORT */ |
95 |
- |
96 |
#if defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) |
97 |
#ifdef EGREP_PROGRAM |
98 |
COMPILE_FCT(Ecompile) |
99 |
@@ -193,6 +171,7 @@ |
100 |
size_t total = size; |
101 |
char const *motif = pattern; |
102 |
|
103 |
+ check_utf8 (); |
104 |
#if 0 |
105 |
if (match_icase) |
106 |
syntax_bits |= RE_ICASE; |
107 |
#@@ -303,47 +282,78 @@ hunk6 |
108 |
@@ -303,20 +282,9 @@ hunk6 |
109 |
struct kwsmatch kwsm; |
110 |
size_t i, ret_val; |
111 |
#ifdef MBS_SUPPORT |
112 |
- char *mb_properties = NULL; |
113 |
- if (MB_CUR_MAX > 1) |
114 |
- { |
115 |
- if (match_icase) |
116 |
- { |
117 |
- char *case_buf = xmalloc(size); |
118 |
- memcpy(case_buf, buf, size); |
119 |
- if (start_ptr) |
120 |
- start_ptr = case_buf + (start_ptr - buf); |
121 |
- buf = case_buf; |
122 |
- } |
123 |
- if (kwset) |
124 |
- mb_properties = check_multibyte_string(buf, size); |
125 |
- } |
126 |
+ int mb_cur_max = MB_CUR_MAX; |
127 |
+ mbstate_t mbs; |
128 |
+ memset (&mbs, '\0', sizeof (mbstate_t)); |
129 |
#endif /* MBS_SUPPORT */ |
130 |
|
131 |
buflim = buf + size; |
132 |
@@ -329,21 +282,63 @@ hunk6 |
133 |
if (kwset) |
134 |
{ |
135 |
/* Find a possible match using the KWset matcher. */ |
136 |
- size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); |
137 |
+#ifdef MBS_SUPPORT |
138 |
+ size_t bytes_left = 0; |
139 |
+#endif /* MBS_SUPPORT */ |
140 |
+ size_t offset; |
141 |
+#ifdef MBS_SUPPORT |
142 |
+ /* kwsexec doesn't work with match_icase and multibyte input. */ |
143 |
+ if (match_icase && mb_cur_max > 1) |
144 |
+ /* Avoid kwset */ |
145 |
+ offset = 0; |
146 |
+ else |
147 |
+#endif /* MBS_SUPPORT */ |
148 |
+ offset = kwsexec (kwset, beg, buflim - beg, &kwsm); |
149 |
if (offset == (size_t) -1) |
150 |
- goto failure; |
151 |
+ return (size_t)-1; |
152 |
+#ifdef MBS_SUPPORT |
153 |
+ if (mb_cur_max > 1 && !using_utf8) |
154 |
+ { |
155 |
+ bytes_left = offset; |
156 |
+ while (bytes_left) |
157 |
+ { |
158 |
+ size_t mlen = mbrlen (beg, bytes_left, &mbs); |
159 |
+ if (mlen == (size_t) -1 || mlen == 0) |
160 |
+ { |
161 |
+ /* Invalid sequence or NUL character: treat as single-byte. */ |
162 |
+ memset (&mbs, '\0', sizeof (mbstate_t)); |
163 |
+ beg++; |
164 |
+ bytes_left--; |
165 |
+ continue; |
166 |
+ } |
167 |
+ |
168 |
+ if (mlen == (size_t) -2) |
169 |
+ /* Offset points inside multibyte character: |
170 |
+ * no good. */ |
171 |
+ break; |
172 |
+ |
173 |
+ beg += mlen; |
174 |
+ bytes_left -= mlen; |
175 |
+ } |
176 |
+ } |
177 |
+ else |
178 |
+#endif /* MBS_SUPPORT */ |
179 |
beg += offset; |
180 |
/* Narrow down to the line containing the candidate, and |
181 |
run it through DFA. */ |
182 |
end = memchr(beg, eol, buflim - beg); |
183 |
end++; |
184 |
#ifdef MBS_SUPPORT |
185 |
- if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) |
186 |
+ if (mb_cur_max > 1 && bytes_left) |
187 |
continue; |
188 |
#endif |
189 |
while (beg > buf && beg[-1] != eol) |
190 |
--beg; |
191 |
- if (kwsm.index < kwset_exact_matches) |
192 |
+ if ( |
193 |
+#ifdef MBS_SUPPORT |
194 |
+ !(match_icase && mb_cur_max > 1) && |
195 |
+#endif /* MBS_SUPPORT */ |
196 |
+ (kwsm.index < kwset_exact_matches)) |
197 |
goto success; |
198 |
if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) |
199 |
continue; |
200 |
@@ -351,13 +363,47 @@ |
201 |
else |
202 |
{ |
203 |
/* No good fixed strings; start with DFA. */ |
204 |
+#ifdef MBS_SUPPORT |
205 |
+ size_t bytes_left = 0; |
206 |
+#endif /* MBS_SUPPORT */ |
207 |
size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); |
208 |
if (offset == (size_t) -1) |
209 |
break; |
210 |
/* Narrow down to the line we've found. */ |
211 |
+#ifdef MBS_SUPPORT |
212 |
+ if (mb_cur_max > 1 && !using_utf8) |
213 |
+ { |
214 |
+ bytes_left = offset; |
215 |
+ while (bytes_left) |
216 |
+ { |
217 |
+ size_t mlen = mbrlen (beg, bytes_left, &mbs); |
218 |
+ if (mlen == (size_t) -1 || mlen == 0) |
219 |
+ { |
220 |
+ /* Invalid sequence or NUL character: treat as single-byte. */ |
221 |
+ memset (&mbs, '\0', sizeof (mbstate_t)); |
222 |
+ beg++; |
223 |
+ bytes_left--; |
224 |
+ continue; |
225 |
+ } |
226 |
+ |
227 |
+ if (mlen == (size_t) -2) |
228 |
+ /* Offset points inside multibyte character: |
229 |
+ * no good. */ |
230 |
+ break; |
231 |
+ |
232 |
+ beg += mlen; |
233 |
+ bytes_left -= mlen; |
234 |
+ } |
235 |
+ } |
236 |
+ else |
237 |
+#endif /* MBS_SUPPORT */ |
238 |
beg += offset; |
239 |
end = memchr (beg, eol, buflim - beg); |
240 |
end++; |
241 |
+#ifdef MBS_SUPPORT |
242 |
+ if (mb_cur_max > 1 && bytes_left) |
243 |
+ continue; |
244 |
+#endif /* MBS_SUPPORT */ |
245 |
while (beg > buf && beg[-1] != eol) |
246 |
--beg; |
247 |
} |
248 |
@@ -475,24 +521,144 @@ |
249 |
*match_size = len; |
250 |
ret_val = beg - buf; |
251 |
out: |
252 |
-#ifdef MBS_SUPPORT |
253 |
- if (MB_CUR_MAX > 1) |
254 |
- { |
255 |
- if (match_icase) |
256 |
- free((char*)buf); |
257 |
- if (mb_properties) |
258 |
- free(mb_properties); |
259 |
- } |
260 |
-#endif /* MBS_SUPPORT */ |
261 |
return ret_val; |
262 |
} |
263 |
#endif /* defined(GREP_PROGRAM) || defined(EGREP_PROGRAM) */ |
264 |
|
265 |
+#ifdef MBS_SUPPORT |
266 |
+static int f_i_multibyte; /* set to 1 by Fcompile when it builds the multibyte -F -i matcher state below */ |
267 |
+static struct /* state filled in by Fcompile and read by Fimbexec */ |
268 |
+{ |
269 |
+ wchar_t **patterns; /* lowercased, L'\0'-terminated wide patterns, one per line of the pattern text */ |
270 |
+ size_t count, maxlen; /* number of patterns; length in wide characters of the longest one */ |
271 |
+ unsigned char *match; /* count flags; Fimbexec sets all to 1, then clears each pattern that stops matching */ |
272 |
+} Fimb; |
273 |
+#endif |
274 |
+ |
275 |
#if defined(GREP_PROGRAM) || defined(FGREP_PROGRAM) |
276 |
COMPILE_FCT(Fcompile) |
277 |
{ |
278 |
+ int mb_cur_max = MB_CUR_MAX; |
279 |
char const *beg, *lim, *err; |
280 |
|
281 |
+ check_utf8 (); |
282 |
+#ifdef MBS_SUPPORT |
283 |
+ /* Support -F -i for UTF-8 input. */ |
284 |
+ if (match_icase && mb_cur_max > 1) |
285 |
+ { |
286 |
+ mbstate_t mbs; |
287 |
+ wchar_t *wcpattern = xmalloc ((size + 1) * sizeof (wchar_t)); |
288 |
+ const char *patternend = pattern; |
289 |
+ size_t wcsize; |
290 |
+ kwset_t fimb_kwset = NULL; |
291 |
+ char *starts = NULL; |
292 |
+ wchar_t *wcbeg, *wclim; |
293 |
+ size_t allocated = 0; |
294 |
+ |
295 |
+ memset (&mbs, '\0', sizeof (mbs)); |
296 |
+# ifdef __GNU_LIBRARY__ |
297 |
+ wcsize = mbsnrtowcs (wcpattern, &patternend, size, size, &mbs); |
298 |
+ if (patternend != pattern + size) |
299 |
+ wcsize = (size_t) -1; |
300 |
+# else |
301 |
+ { |
302 |
+ char *patterncopy = xmalloc (size + 1); |
303 |
+ |
304 |
+ memcpy (patterncopy, pattern, size); |
305 |
+ patterncopy[size] = '\0'; |
306 |
+ patternend = patterncopy; |
307 |
+ wcsize = mbsrtowcs (wcpattern, &patternend, size, &mbs); |
308 |
+ if (patternend != patterncopy + size) |
309 |
+ wcsize = (size_t) -1; |
310 |
+ free (patterncopy); |
311 |
+ } |
312 |
+# endif |
313 |
+ if (wcsize + 2 <= 2) |
314 |
+ { |
315 |
+fimb_fail: |
316 |
+ free (wcpattern); |
317 |
+ free (starts); |
318 |
+ if (fimb_kwset) |
319 |
+ kwsfree (fimb_kwset); |
320 |
+ free (Fimb.patterns); |
321 |
+ Fimb.patterns = NULL; |
322 |
+ } |
323 |
+ else |
324 |
+ { |
325 |
+ if (!(fimb_kwset = kwsalloc (NULL))) |
326 |
+ error (2, 0, _("memory exhausted")); |
327 |
+ |
328 |
+ starts = xmalloc (mb_cur_max * 3); |
329 |
+ wcbeg = wcpattern; |
330 |
+ do |
331 |
+ { |
332 |
+ int i; |
333 |
+ size_t wclen; |
334 |
+ |
335 |
+ if (Fimb.count >= allocated) |
336 |
+ { |
337 |
+ if (allocated == 0) |
338 |
+ allocated = 128; |
339 |
+ else |
340 |
+ allocated *= 2; |
341 |
+ Fimb.patterns = xrealloc (Fimb.patterns, |
342 |
+ sizeof (wchar_t *) * allocated); |
343 |
+ } |
344 |
+ Fimb.patterns[Fimb.count++] = wcbeg; |
345 |
+ for (wclim = wcbeg; |
346 |
+ wclim < wcpattern + wcsize && *wclim != L'\n'; ++wclim) |
347 |
+ *wclim = towlower (*wclim); |
348 |
+ *wclim = L'\0'; |
349 |
+ wclen = wclim - wcbeg; |
350 |
+ if (wclen > Fimb.maxlen) |
351 |
+ Fimb.maxlen = wclen; |
352 |
+ if (wclen > 3) |
353 |
+ wclen = 3; |
354 |
+ if (wclen == 0) |
355 |
+ { |
356 |
+ if ((err = kwsincr (fimb_kwset, "", 0)) != 0) |
357 |
+ error (2, 0, err); |
358 |
+ } |
359 |
+ else |
360 |
+ for (i = 0; i < (1 << wclen); i++) |
361 |
+ { |
362 |
+ char *p = starts; |
363 |
+ int j, k; |
364 |
+ |
365 |
+ for (j = 0; j < wclen; ++j) |
366 |
+ { |
367 |
+ wchar_t wc = wcbeg[j]; |
368 |
+ if (i & (1 << j)) |
369 |
+ { |
370 |
+ wc = towupper (wc); |
371 |
+ if (wc == wcbeg[j]) |
372 |
+ continue; |
373 |
+ } |
374 |
+ k = wctomb (p, wc); |
375 |
+ if (k <= 0) |
376 |
+ goto fimb_fail; |
377 |
+ p += k; |
378 |
+ } |
379 |
+ if ((err = kwsincr (fimb_kwset, starts, p - starts)) != 0) |
380 |
+ error (2, 0, err); |
381 |
+ } |
382 |
+ if (wclim < wcpattern + wcsize) |
383 |
+ ++wclim; |
384 |
+ wcbeg = wclim; |
385 |
+ } |
386 |
+ while (wcbeg < wcpattern + wcsize); |
387 |
+ f_i_multibyte = 1; |
388 |
+ kwset = fimb_kwset; |
389 |
+ free (starts); |
390 |
+ Fimb.match = xmalloc (Fimb.count); |
391 |
+ if ((err = kwsprep (kwset)) != 0) |
392 |
+ error (2, 0, err); |
393 |
+ return; |
394 |
+ } |
395 |
+ } |
396 |
+#endif /* MBS_SUPPORT */ |
397 |
+ |
398 |
+ |
399 |
kwsinit (); |
400 |
beg = pattern; |
401 |
do |
402 |
@@ -511,6 +677,76 @@ |
403 |
error (2, 0, err); |
404 |
} |
405 |
|
406 |
+#ifdef MBS_SUPPORT |
407 |
+static int |
408 |
+Fimbexec (const char *buf, size_t size, size_t *plen, int exact) |
409 |
+{ |
410 |
+ size_t len, letter, i; /* byte offset in BUF; wide-character index in the patterns; pattern index */ |
411 |
+ int ret = -1; |
412 |
+ mbstate_t mbs; |
413 |
+ wchar_t wc; |
414 |
+ int patterns_left; |
415 |
+ /* Match the lowercased Fimb patterns against the start of BUF (SIZE bytes), lowercasing each input character with towlower. Returns 0 and stores the matched byte length in *PLEN when a pattern matches, -1 otherwise. */ |
416 |
+ assert (match_icase && f_i_multibyte == 1); |
417 |
+ assert (MB_CUR_MAX > 1); |
418 |
+ /* Every pattern starts out alive; Fimb.match[i] is cleared once pattern i can no longer match or has been reported. */ |
419 |
+ memset (&mbs, '\0', sizeof (mbs)); |
420 |
+ memset (Fimb.match, '\1', Fimb.count); |
421 |
+ letter = len = 0; |
422 |
+ patterns_left = 1; |
423 |
+ while (patterns_left && len <= size) /* the len == size pass reports patterns that end exactly at the end of BUF */ |
424 |
+ { |
425 |
+ size_t c; /* bytes consumed from BUF by the current character */ |
426 |
+ /* patterns_left is set again below if any live pattern still has characters to compare. */ |
427 |
+ patterns_left = 0; |
428 |
+ if (len < size) |
429 |
+ { |
430 |
+ c = mbrtowc (&wc, buf + len, size - len, &mbs); |
431 |
+ if (c + 2 <= 2) /* c is 0, (size_t) -1 or (size_t) -2: NUL, invalid or truncated character */ |
432 |
+ { c = 1; wc = L'\0'; } /* act as end of input, so a pattern already complete here is still reported instead of being dropped */ |
433 |
+ else |
434 |
+ wc = towlower (wc); |
435 |
+ } |
436 |
+ else |
437 |
+ { |
438 |
+ c = 1; |
439 |
+ wc = L'\0'; |
440 |
+ } |
441 |
+ /* Compare WC with character LETTER of every pattern that is still alive. */ |
442 |
+ for (i = 0; i < Fimb.count; i++) |
443 |
+ { |
444 |
+ if (Fimb.match[i]) |
445 |
+ { |
446 |
+ if (Fimb.patterns[i][letter] == L'\0') |
447 |
+ { |
448 |
+ /* Found a match: the whole pattern was consumed before byte offset LEN. */ |
449 |
+ *plen = len; |
450 |
+ if (!exact && !match_words) |
451 |
+ return 0; |
452 |
+ else |
453 |
+ { |
454 |
+ /* For -w or exact look for longest match. */ |
455 |
+ ret = 0; |
456 |
+ Fimb.match[i] = '\0'; |
457 |
+ continue; |
458 |
+ } |
459 |
+ } |
460 |
+ /* Pattern has more characters: keep it alive only if the next one equals WC. */ |
461 |
+ if (Fimb.patterns[i][letter] == wc) |
462 |
+ patterns_left = 1; |
463 |
+ else |
464 |
+ Fimb.match[i] = '\0'; |
465 |
+ } |
466 |
+ } |
467 |
+ |
468 |
+ len += c; |
469 |
+ letter++; |
470 |
+ } |
471 |
+ |
472 |
+ return ret; |
473 |
+} |
474 |
+#endif /* MBS_SUPPORT */ |
475 |
+ |
476 |
EXECUTE_FCT(Fexecute) |
477 |
{ |
478 |
register char const *beg, *try, *end; |
479 |
@@ -519,69 +755,256 @@ |
480 |
struct kwsmatch kwsmatch; |
481 |
size_t ret_val; |
482 |
#ifdef MBS_SUPPORT |
483 |
- char *mb_properties = NULL; |
484 |
- if (MB_CUR_MAX > 1) |
485 |
- { |
486 |
- if (match_icase) |
487 |
- { |
488 |
- char *case_buf = xmalloc(size); |
489 |
- memcpy(case_buf, buf, size); |
490 |
- if (start_ptr) |
491 |
- start_ptr = case_buf + (start_ptr - buf); |
492 |
- buf = case_buf; |
493 |
- } |
494 |
- mb_properties = check_multibyte_string(buf, size); |
495 |
- } |
496 |
+ int mb_cur_max = MB_CUR_MAX; |
497 |
+ mbstate_t mbs; |
498 |
+ memset (&mbs, '\0', sizeof (mbstate_t)); |
499 |
+ const char *last_char = NULL; |
500 |
#endif /* MBS_SUPPORT */ |
501 |
|
502 |
for (beg = start_ptr ? start_ptr : buf; beg <= buf + size; beg++) |
503 |
{ |
504 |
size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); |
505 |
if (offset == (size_t) -1) |
506 |
- goto failure; |
507 |
+ return offset; |
508 |
#ifdef MBS_SUPPORT |
509 |
- if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) |
510 |
- continue; /* It is a part of multibyte character. */ |
511 |
+ if (mb_cur_max > 1 && !using_utf8) |
512 |
+ { |
513 |
+ size_t bytes_left = offset; |
514 |
+ while (bytes_left) |
515 |
+ { |
516 |
+ size_t mlen = mbrlen (beg, bytes_left, &mbs); |
517 |
+ |
518 |
+ last_char = beg; |
519 |
+ if (mlen == (size_t) -1 || mlen == 0) |
520 |
+ { |
521 |
+ /* Invalid sequence or NUL character: treat as single-byte. */ |
522 |
+ memset (&mbs, '\0', sizeof (mbstate_t)); |
523 |
+ beg++; |
524 |
+ bytes_left--; |
525 |
+ continue; |
526 |
+ } |
527 |
+ |
528 |
+ if (mlen == (size_t) -2) |
529 |
+ /* Offset points inside multibyte character: no good. */ |
530 |
+ break; |
531 |
+ |
532 |
+ beg += mlen; |
533 |
+ bytes_left -= mlen; |
534 |
+ } |
535 |
+ |
536 |
+ if (bytes_left) |
537 |
+ continue; |
538 |
+ } |
539 |
+ else |
540 |
#endif /* MBS_SUPPORT */ |
541 |
beg += offset; |
542 |
+#ifdef MBS_SUPPORT |
543 |
+ /* For f_i_multibyte, the string at beg now matches first 3 chars of |
544 |
+ one of the search strings (less if there are shorter search strings). |
545 |
+ See if this is a real match. */ |
546 |
+ if (f_i_multibyte |
547 |
+ && Fimbexec (beg, buf + size - beg, &kwsmatch.size[0], start_ptr == NULL)) |
548 |
+ goto next_char; |
549 |
+#endif /* MBS_SUPPORT */ |
550 |
len = kwsmatch.size[0]; |
551 |
if (start_ptr && !match_words) |
552 |
goto success_in_beg_and_len; |
553 |
if (match_lines) |
554 |
{ |
555 |
if (beg > buf && beg[-1] != eol) |
556 |
- continue; |
557 |
+ goto next_char; |
558 |
if (beg + len < buf + size && beg[len] != eol) |
559 |
- continue; |
560 |
+ goto next_char; |
561 |
goto success; |
562 |
} |
563 |
else if (match_words) |
564 |
- for (try = beg; len; ) |
565 |
- { |
566 |
- if (try > buf && WCHAR((unsigned char) try[-1])) |
567 |
- break; |
568 |
- if (try + len < buf + size && WCHAR((unsigned char) try[len])) |
569 |
- { |
570 |
- offset = kwsexec (kwset, beg, --len, &kwsmatch); |
571 |
- if (offset == (size_t) -1) |
572 |
- break; |
573 |
- try = beg + offset; |
574 |
- len = kwsmatch.size[0]; |
575 |
- } |
576 |
- else if (!start_ptr) |
577 |
- goto success; |
578 |
- else |
579 |
- goto success_in_beg_and_len; |
580 |
- } /* for (try) */ |
581 |
- else |
582 |
- goto success; |
583 |
- } /* for (beg in buf) */ |
584 |
+ { |
585 |
+ while (len) |
586 |
+ { |
587 |
+ int word_match = 0; |
588 |
+ if (beg > buf) |
589 |
+ { |
590 |
+#ifdef MBS_SUPPORT |
591 |
+ if (mb_cur_max > 1) |
592 |
+ { |
593 |
+ const char *s; |
594 |
+ int mr; |
595 |
+ wchar_t pwc; |
596 |
+ |
597 |
+ if (using_utf8) |
598 |
+ { |
599 |
+ s = beg - 1; |
600 |
+ while (s > buf |
601 |
+ && (unsigned char) *s >= 0x80 |
602 |
+ && (unsigned char) *s <= 0xbf) |
603 |
+ --s; |
604 |
+ } |
605 |
+ else |
606 |
+ s = last_char; |
607 |
+ mr = mbtowc (&pwc, s, beg - s); |
608 |
+ if (mr <= 0) |
609 |
+ memset (&mbs, '\0', sizeof (mbstate_t)); |
610 |
+ else if ((iswalnum (pwc) || pwc == L'_') |
611 |
+ && mr == (int) (beg - s)) |
612 |
+ goto next_char; |
613 |
+ } |
614 |
+ else |
615 |
+#endif /* MBS_SUPPORT */ |
616 |
+ if (WCHAR ((unsigned char) beg[-1])) |
617 |
+ goto next_char; |
618 |
+ } |
619 |
+#ifdef MBS_SUPPORT |
620 |
+ if (mb_cur_max > 1) |
621 |
+ { |
622 |
+ wchar_t nwc; |
623 |
+ int mr; |
624 |
|
625 |
- failure: |
626 |
- ret_val = -1; |
627 |
- goto out; |
628 |
+ mr = mbtowc (&nwc, beg + len, buf + size - beg - len); |
629 |
+ if (mr <= 0) |
630 |
+ { |
631 |
+ memset (&mbs, '\0', sizeof (mbstate_t)); |
632 |
+ word_match = 1; |
633 |
+ } |
634 |
+ else if (!iswalnum (nwc) && nwc != L'_') |
635 |
+ word_match = 1; |
636 |
+ } |
637 |
+ else |
638 |
+#endif /* MBS_SUPPORT */ |
639 |
+ if (beg + len >= buf + size || !WCHAR ((unsigned char) beg[len])) |
640 |
+ word_match = 1; |
641 |
+ if (word_match) |
642 |
+ { |
643 |
+ if (start_ptr == NULL) |
644 |
+ /* Returns the whole line now we know there's a word match. */ |
645 |
+ goto success; |
646 |
+ else { |
647 |
+ /* Returns just this word match. */ |
648 |
+ *match_size = len; |
649 |
+ return beg - buf; |
650 |
+ } |
651 |
+ } |
652 |
+ if (len > 0) |
653 |
+ { |
654 |
+ /* Try a shorter length anchored at the same place. */ |
655 |
+ --len; |
656 |
+ offset = kwsexec (kwset, beg, len, &kwsmatch); |
657 |
+ |
658 |
+ if (offset == -1) |
659 |
+ goto next_char; /* Try a different anchor. */ |
660 |
+#ifdef MBS_SUPPORT |
661 |
+ |
662 |
+ if (mb_cur_max > 1 && !using_utf8) |
663 |
+ { |
664 |
+ size_t bytes_left = offset; |
665 |
+ while (bytes_left) |
666 |
+ { |
667 |
+ size_t mlen = mbrlen (beg, bytes_left, &mbs); |
668 |
+ |
669 |
+ last_char = beg; |
670 |
+ if (mlen == (size_t) -1 || mlen == 0) |
671 |
+ { |
672 |
+ /* Invalid sequence or NUL character: treat as single-byte. */ |
673 |
+ memset (&mbs, '\0', sizeof (mbstate_t)); |
674 |
+ beg++; |
675 |
+ bytes_left--; |
676 |
+ continue; |
677 |
+ } |
678 |
+ |
679 |
+ if (mlen == (size_t) -2) |
680 |
+ { |
681 |
+ /* Offset points inside multibyte character: |
682 |
+ * no good. */ |
683 |
+ break; |
684 |
+ } |
685 |
+ |
686 |
+ beg += mlen; |
687 |
+ bytes_left -= mlen; |
688 |
+ } |
689 |
+ |
690 |
+ if (bytes_left) |
691 |
+ { |
692 |
+ memset (&mbs, '\0', sizeof (mbstate_t)); |
693 |
+ goto next_char; /* Try a different anchor. */ |
694 |
+ } |
695 |
+ } |
696 |
+ else |
697 |
+#endif /* MBS_SUPPORT */ |
698 |
+ beg += offset; |
699 |
+#ifdef MBS_SUPPORT |
700 |
+ /* The string at beg now matches first 3 chars of one of |
701 |
+ the search strings (less if there are shorter search |
702 |
+ strings). See if this is a real match. */ |
703 |
+ if (f_i_multibyte |
704 |
+ && Fimbexec (beg, len - offset, &kwsmatch.size[0], |
705 |
+ start_ptr == NULL)) |
706 |
+ goto next_char; |
707 |
+#endif /* MBS_SUPPORT */ |
708 |
+ len = kwsmatch.size[0]; |
709 |
+ } |
710 |
+ } |
711 |
+ } |
712 |
+ else |
713 |
+ goto success; |
714 |
+next_char:; |
715 |
+#ifdef MBS_SUPPORT |
716 |
+ /* Advance to next character. For MB_CUR_MAX == 1 case this is handled |
717 |
+ by ++beg above. */ |
718 |
+ if (mb_cur_max > 1) |
719 |
+ { |
720 |
+ if (using_utf8) |
721 |
+ { |
722 |
+ unsigned char c = *beg; |
723 |
+ if (c >= 0xc2) |
724 |
+ { |
725 |
+ if (c < 0xe0) |
726 |
+ ++beg; |
727 |
+ else if (c < 0xf0) |
728 |
+ beg += 2; |
729 |
+ else if (c < 0xf8) |
730 |
+ beg += 3; |
731 |
+ else if (c < 0xfc) |
732 |
+ beg += 4; |
733 |
+ else if (c < 0xfe) |
734 |
+ beg += 5; |
735 |
+ } |
736 |
+ } |
737 |
+ else |
738 |
+ { |
739 |
+ size_t l = mbrlen (beg, buf + size - beg, &mbs); |
740 |
+ |
741 |
+ last_char = beg; |
742 |
+ if (l + 2 >= 2) |
743 |
+ beg += l - 1; |
744 |
+ else |
745 |
+ memset (&mbs, '\0', sizeof (mbstate_t)); |
746 |
+ } |
747 |
+ } |
748 |
+#endif /* MBS_SUPPORT */ |
749 |
+ } |
750 |
+ |
751 |
+ return -1; |
752 |
|
753 |
success: |
754 |
+#ifdef MBS_SUPPORT |
755 |
+ if (mb_cur_max > 1 && !using_utf8) |
756 |
+ { |
757 |
+ end = beg + len; |
758 |
+ while (end < buf + size) |
759 |
+ { |
760 |
+ size_t mlen = mbrlen (end, buf + size - end, &mbs); |
761 |
+ if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0) |
762 |
+ { |
763 |
+ memset (&mbs, '\0', sizeof (mbstate_t)); |
764 |
+ mlen = 1; |
765 |
+ } |
766 |
+ if (mlen == 1 && *end == eol) |
767 |
+ break; |
768 |
+ |
769 |
+ end += mlen; |
770 |
+ } |
771 |
+ } |
772 |
+ else |
773 |
+ #endif /* MBS_SUPPORT */ |
774 |
end = memchr (beg + len, eol, (buf + size) - (beg + len)); |
775 |
end++; |
776 |
while (buf < beg && beg[-1] != eol) |
777 |
@@ -591,15 +1016,6 @@ |
778 |
*match_size = len; |
779 |
ret_val = beg - buf; |
780 |
out: |
781 |
-#ifdef MBS_SUPPORT |
782 |
- if (MB_CUR_MAX > 1) |
783 |
- { |
784 |
- if (match_icase) |
785 |
- free((char*)buf); |
786 |
- if (mb_properties) |
787 |
- free(mb_properties); |
788 |
- } |
789 |
-#endif /* MBS_SUPPORT */ |
790 |
return ret_val; |
791 |
} |
792 |
#endif /* defined(GREP_PROGRAM) || defined(FGREP_PROGRAM) */ |