|
1 diff -r -u sed-4.2.1.orig/lib/regcomp.c sed-4.2.1.patched/lib/regcomp.c |
|
2 --- sed-4.2.1.orig/lib/regcomp.c Wed Jun 3 12:10:51 2009 |
|
3 +++ sed-4.2.1.patched/lib/regcomp.c Mon Jul 27 11:09:06 2009 |
|
4 @@ -32,8 +32,10 @@ |
|
5 static void free_workarea_compile (regex_t *preg); |
|
6 static reg_errcode_t create_initial_state (re_dfa_t *dfa); |
|
7 #ifdef RE_ENABLE_I18N |
|
8 +#ifdef USE_UTF8_OPTIMIZATION |
|
9 static void optimize_utf8 (re_dfa_t *dfa); |
|
10 #endif |
|
11 +#endif /* RE_ENABLE_I18N */ |
|
12 static reg_errcode_t analyze (regex_t *preg); |
|
13 static reg_errcode_t preorder (bin_tree_t *root, |
|
14 reg_errcode_t (fn (void *, bin_tree_t *)), |
|
15 @@ -642,7 +644,9 @@ |
|
16 } |
|
17 re_free (dfa->state_table); |
|
18 #ifdef RE_ENABLE_I18N |
|
19 +#ifdef USE_UTF8_OPTIMIZATION |
|
20 if (dfa->sb_char != utf8_sb_map) |
|
21 +#endif /* USE_UTF8_OPTIMIZATION */ |
|
22 re_free (dfa->sb_char); |
|
23 #endif |
|
24 re_free (dfa->subexp_map); |
|
25 @@ -823,10 +827,12 @@ |
|
26 goto re_compile_internal_free_return; |
|
27 |
|
28 #ifdef RE_ENABLE_I18N |
|
29 +#ifdef USE_UTF8_OPTIMIZATION |
|
30 /* If possible, do searching in single byte encoding to speed things up. */ |
|
31 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) |
|
32 optimize_utf8 (dfa); |
|
33 #endif |
|
34 +#endif /* RE_ENABLE_I18N */ |
|
35 |
|
36 /* Then create the initial state of the dfa. */ |
|
37 err = create_initial_state (dfa); |
|
38 @@ -889,14 +895,18 @@ |
|
39 |
|
40 dfa->mb_cur_max = MB_CUR_MAX; |
|
41 #ifdef _LIBC |
|
42 +#ifdef USE_UTF8_OPTIMIZATION |
|
43 if (dfa->mb_cur_max == 6 |
|
44 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) |
|
45 dfa->is_utf8 = 1; |
|
46 +#endif /* USE_UTF8_OPTIMIZATION */ |
|
47 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) |
|
48 != 0); |
|
49 #else |
|
50 +#ifdef USE_UTF8_OPTIMIZATION |
|
51 if (strcmp (locale_charset (), "UTF-8") == 0) |
|
52 dfa->is_utf8 = 1; |
|
53 +#endif /* USE_UTF8_OPTIMIZATION */ |
|
54 |
|
55 /* We check exhaustively in the loop below if this charset is a |
|
56 superset of ASCII. */ |
|
57 @@ -906,9 +916,11 @@ |
|
58 #ifdef RE_ENABLE_I18N |
|
59 if (dfa->mb_cur_max > 1) |
|
60 { |
|
61 +#ifdef USE_UTF8_OPTIMIZATION |
|
62 if (dfa->is_utf8) |
|
63 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; |
|
64 else |
|
65 +#endif /* USE_UTF8_OPTIMIZATION */ |
|
66 { |
|
67 int i, j, ch; |
|
68 |
|
69 @@ -1128,7 +1140,9 @@ |
|
70 |
|
71 /* The search can be in single byte locale. */ |
|
72 dfa->mb_cur_max = 1; |
|
73 +#ifdef USE_UTF8_OPTIMIZATION |
|
74 dfa->is_utf8 = 0; |
|
75 +#endif /* USE_UTF8_OPTIMIZATION */ |
|
76 dfa->has_mb_node = dfa->nbackref > 0 || has_period; |
|
77 } |
|
78 #endif |
|
79 diff -r -u sed-4.2.1.orig/lib/regex_internal.c sed-4.2.1.patched/lib/regex_internal.c |
|
80 --- sed-4.2.1.orig/lib/regex_internal.c Wed Jun 3 12:10:51 2009 |
|
81 +++ sed-4.2.1.patched/lib/regex_internal.c Mon Jul 27 10:52:30 2009 |
|
82 @@ -181,7 +181,9 @@ |
|
83 pstr->icase = icase; |
|
84 pstr->mbs_allocated = (trans != NULL || icase); |
|
85 pstr->mb_cur_max = dfa->mb_cur_max; |
|
86 +#ifdef USE_UTF8_OPTIMIZATION |
|
87 pstr->is_utf8 = dfa->is_utf8; |
|
88 +#endif /* USE_UTF8_OPTIMIZATION */ |
|
89 pstr->map_notascii = dfa->map_notascii; |
|
90 pstr->stop = pstr->len; |
|
91 pstr->raw_stop = pstr->stop; |
|
92 @@ -707,6 +709,7 @@ |
|
93 Idx wcs_idx; |
|
94 wint_t wc = WEOF; |
|
95 |
|
96 +#ifdef USE_UTF8_OPTIMIZATION |
|
97 if (pstr->is_utf8) |
|
98 { |
|
99 const unsigned char *raw, *p, *end; |
|
100 @@ -760,6 +763,7 @@ |
|
101 break; |
|
102 } |
|
103 } |
|
104 +#endif /* USE_UTF8_OPTIMIZATION */ |
|
105 |
|
106 if (wc == WEOF) |
|
107 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx; |
|
108 diff -r -u sed-4.2.1.orig/lib/regex_internal.h sed-4.2.1.patched/lib/regex_internal.h |
|
109 --- sed-4.2.1.orig/lib/regex_internal.h Wed Jun 3 12:10:51 2009 |
|
110 +++ sed-4.2.1.patched/lib/regex_internal.h Mon Jul 27 10:52:30 2009 |
|
111 @@ -406,7 +406,9 @@ |
|
112 re_const_bitset_ptr_t word_char; |
|
113 /* true if REG_ICASE. */ |
|
114 unsigned char icase; |
|
115 +#ifdef USE_UTF8_OPTIMIZATION |
|
116 unsigned char is_utf8; |
|
117 +#endif /* USE_UTF8_OPTIMIZATION */ |
|
118 unsigned char map_notascii; |
|
119 unsigned char mbs_allocated; |
|
120 unsigned char offsets_needed; |
|
121 @@ -690,7 +692,9 @@ |
|
122 a node which can accept multibyte character or multi character |
|
123 collating element. */ |
|
124 unsigned int has_mb_node : 1; |
|
125 +#ifdef USE_UTF8_OPTIMIZATION |
|
126 unsigned int is_utf8 : 1; |
|
127 +#endif /* USE_UTF8_OPTIMIZATION */ |
|
128 unsigned int map_notascii : 1; |
|
129 unsigned int word_ops_used : 1; |
|
130 int mb_cur_max; |