1 |
/* |
2 |
* Author: Kasun Gajasinghe |
3 |
* E-Mail: kasunbg AT gmail DOT com |
4 |
* Date: 09.08.2010 |
5 |
* |
6 |
* usage: stemmer(word); |
7 |
* ex: var stem = stemmer(foobar); |
8 |
* Implementation of the stemming algorithm from http://snowball.tartarus.org/algorithms/french/stemmer.html |
9 |
* |
10 |
* LICENSE: |
11 |
* |
12 |
* Copyright (c) 2010, Kasun Gajasinghe. All rights reserved. |
13 |
* |
14 |
* Redistribution and use in source and binary forms, with or without modification, |
15 |
* are permitted provided that the following conditions are met: |
16 |
* |
17 |
* 1. Redistributions of source code must retain the above copyright notice, |
18 |
* this list of conditions and the following disclaimer. |
19 |
* |
20 |
* 2. Redistributions in binary form must reproduce the above copyright notice, |
21 |
* this list of conditions and the following disclaimer in the documentation |
22 |
* and/or other materials provided with the distribution. |
23 |
* |
24 |
* |
25 |
* THIS SOFTWARE IS PROVIDED BY KASUN GAJASINGHE ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, |
26 |
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A |
27 |
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL KASUN GAJASINGHE BE LIABLE FOR ANY DIRECT, |
28 |
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
29 |
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
30 |
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, |
31 |
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE |
32 |
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
33 |
* |
34 |
*/ |
35 |
|
36 |
var stemmer = function(word){ |
37 |
// Letters in French include the following accented forms, |
38 |
// â à ç ë é ê è ï î ô û ù |
39 |
// The following letters are vowels: |
40 |
// a e i o u y â à ë é ê è ï î ô û ù |
41 |
|
42 |
word = word.toLowerCase(); |
43 |
var oriWord = word; |
44 |
word = word.replace(/qu/g, 'qU'); //have to perform first, as after the operation, capital U is not treated as a vowel |
45 |
word = word.replace(/([aeiouyâàëéêèïîôûù])u([aeiouyâàëéêèïîôûù])/g, '$1U$2'); |
46 |
word = word.replace(/([aeiouyâàëéêèïîôûù])i([aeiouyâàëéêèïîôûù])/g, '$1I$2'); |
47 |
word = word.replace(/([aeiouyâàëéêèïîôûù])y/g, '$1Y'); |
48 |
word = word.replace(/y([aeiouyâàëéêèïîôûù])/g, 'Y$1'); |
49 |
|
50 |
var rv=''; |
51 |
var rvIndex = -1; |
52 |
if(word.search(/^(par|col|tap)/) != -1 || word.search(/^[aeiouyâàëéêèïîôûù]{2}/) != -1){ |
53 |
rv = word.substring(3); |
54 |
rvIndex = 3; |
55 |
} else { |
56 |
rvIndex = word.substring(1).search(/[aeiouyâàëéêèïîôûù]/); |
57 |
if(rvIndex != -1){ |
58 |
rvIndex +=2; //+2 is to supplement the substring(1) used to find rvIndex |
59 |
rv = word.substring(rvIndex); |
60 |
} else { |
61 |
rvIndex = word.length; |
62 |
} |
63 |
} |
64 |
|
65 |
// R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel. |
66 |
// R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel |
67 |
var r1Index = word.search(/[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/); |
68 |
var r1 = ''; |
69 |
if (r1Index != -1) { |
70 |
r1Index += 2; |
71 |
r1 = word.substring(r1Index); |
72 |
} else { |
73 |
r1Index = word.length; |
74 |
} |
75 |
|
76 |
var r2Index = -1; |
77 |
var r2 = ''; |
78 |
if (r1Index != -1) { |
79 |
r2Index = r1.search(/[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/); |
80 |
if (r2Index != -1) { |
81 |
r2Index += 2; |
82 |
r2 = r1.substring(r2Index); |
83 |
r2Index += r1Index; |
84 |
} else { |
85 |
r2 = ''; |
86 |
r2Index = word.length; |
87 |
} |
88 |
} |
89 |
if (r1Index != -1 && r1Index < 3) { |
90 |
r1Index = 3; |
91 |
r1 = word.substring(r1Index); |
92 |
} |
93 |
|
94 |
/* |
95 |
Step 1: Standard suffix removal |
96 |
*/ |
97 |
var a1Index = word.search(/(ance|iqUe|isme|able|iste|eux|ances|iqUes|ismes|ables|istes)$/); |
98 |
var a2Index = word.search(/(atrice|ateur|ation|atrices|ateurs|ations)$/); |
99 |
var a3Index = word.search(/(logie|logies)$/); |
100 |
var a4Index = word.search(/(usion|ution|usions|utions)$/); |
101 |
var a5Index = word.search(/(ence|ences)$/); |
102 |
var a6Index = word.search(/(ement|ements)$/); |
103 |
var a7Index = word.search(/(ité|ités)$/); |
104 |
var a8Index = word.search(/(if|ive|ifs|ives)$/); |
105 |
var a9Index = word.search(/(eaux)$/); |
106 |
var a10Index = word.search(/(aux)$/); |
107 |
var a11Index = word.search(/(euse|euses)$/); |
108 |
var a12Index = word.search(/[^aeiouyâàëéêèïîôûù](issement|issements)$/); |
109 |
var a13Index = word.search(/(amment)$/); |
110 |
var a14Index = word.search(/(emment)$/); |
111 |
var a15Index = word.search(/[aeiouyâàëéêèïîôûù](ment|ments)$/); |
112 |
|
113 |
if(a1Index != -1 && a1Index >= r2Index){ |
114 |
word = word.substring(0,a1Index); |
115 |
} else if(a2Index != -1 && a2Index >= r2Index){ |
116 |
word = word.substring(0,a2Index); |
117 |
var a2Index2 = word.search(/(ic)$/); |
118 |
if(a2Index2 != -1 && a2Index2 >= r2Index){ |
119 |
word = word.substring(0, a2Index2); //if preceded by ic, delete if in R2, |
120 |
} else { //else replace by iqU |
121 |
word = word.replace(/(ic)$/,'iqU'); |
122 |
} |
123 |
} else if(a3Index != -1 && a3Index >= r2Index){ |
124 |
word = word.replace(/(logie|logies)$/,'log'); //replace with log if in R2 |
125 |
} else if(a4Index != -1 && a4Index >= r2Index){ |
126 |
word = word.replace(/(usion|ution|usions|utions)$/,'u'); //replace with u if in R2 |
127 |
} else if(a5Index != -1 && a5Index >= r2Index){ |
128 |
word = word.replace(/(ence|ences)$/,'ent'); //replace with ent if in R2 |
129 |
} else if(a6Index != -1 && a6Index >= rvIndex){ |
130 |
word = word.substring(0,a6Index); |
131 |
if(word.search(/(iv)$/) >= r2Index){ |
132 |
word = word.replace(/(iv)$/, ''); |
133 |
if(word.search(/(at)$/) >= r2Index){ |
134 |
word = word.replace(/(at)$/, ''); |
135 |
} |
136 |
} else if(word.search(/(eus)$/) != -1){ |
137 |
var a6Index2 = word.search(/(eus)$/); |
138 |
if(a6Index2 >=r2Index){ |
139 |
word = word.substring(0, a6Index2); |
140 |
} else if(a6Index2 >= r1Index){ |
141 |
word = word.substring(0,a6Index2)+"eux"; |
142 |
} |
143 |
} else if(word.search(/(abl|iqU)$/) >= r2Index){ |
144 |
word = word.replace(/(abl|iqU)$/,''); //if preceded by abl or iqU, delete if in R2, |
145 |
} else if(word.search(/(ièr|Ièr)$/) >= rvIndex){ |
146 |
word = word.replace(/(ièr|Ièr)$/,'i'); //if preceded by abl or iqU, delete if in R2, |
147 |
} |
148 |
} else if(a7Index != -1 && a7Index >= r2Index){ |
149 |
word = word.substring(0,a7Index); //delete if in R2 |
150 |
if(word.search(/(abil)$/) != -1){ //if preceded by abil, delete if in R2, else replace by abl, otherwise, |
151 |
var a7Index2 = word.search(/(abil)$/); |
152 |
if(a7Index2 >=r2Index){ |
153 |
word = word.substring(0, a7Index2); |
154 |
} else { |
155 |
word = word.substring(0,a7Index2)+"abl"; |
156 |
} |
157 |
} else if(word.search(/(ic)$/) != -1){ |
158 |
var a7Index3 = word.search(/(ic)$/); |
159 |
if(a7Index3 != -1 && a7Index3 >= r2Index){ |
160 |
word = word.substring(0, a7Index3); //if preceded by ic, delete if in R2, |
161 |
} else { //else replace by iqU |
162 |
word = word.replace(/(ic)$/,'iqU'); |
163 |
} |
164 |
} else if(word.search(/(iv)$/) != r2Index){ |
165 |
word = word.replace(/(iv)$/,''); |
166 |
} |
167 |
} else if(a8Index != -1 && a8Index >= r2Index){ |
168 |
word = word.substring(0,a8Index); |
169 |
if(word.search(/(at)$/) >= r2Index){ |
170 |
word = word.replace(/(at)$/, ''); |
171 |
if(word.search(/(ic)$/) >= r2Index){ |
172 |
word = word.replace(/(ic)$/, ''); |
173 |
} else { word = word.replace(/(ic)$/, 'iqU'); } |
174 |
} |
175 |
} else if(a9Index != -1){ word = word.replace(/(eaux)/,'eau') |
176 |
} else if(a10Index >= r1Index){ word = word.replace(/(aux)/,'al') |
177 |
} else if(a11Index != -1 ){ |
178 |
var a11Index2 = word.search(/(euse|euses)$/); |
179 |
if(a11Index2 >=r2Index){ |
180 |
word = word.substring(0, a11Index2); |
181 |
} else if(a11Index2 >= r1Index){ |
182 |
word = word.substring(0, a11Index2)+"eux"; |
183 |
} |
184 |
} else if(a12Index!=-1 && a12Index>=r1Index){ |
185 |
word = word.substring(0,a12Index+1); //+1- amendment to non-vowel |
186 |
} else if(a13Index!=-1 && a13Index>=rvIndex){ |
187 |
word = word.replace(/(amment)$/,'ant'); |
188 |
} else if(a14Index!=-1 && a14Index>=rvIndex){ |
189 |
word = word.replace(/(emment)$/,'ent'); |
190 |
} else if(a15Index!=-1 && a15Index>=rvIndex){ |
191 |
word = word.substring(0,a15Index+1); |
192 |
} |
193 |
|
194 |
/* Step 2a: Verb suffixes beginning i*/ |
195 |
var wordStep1 = word; |
196 |
var step2aDone = false; |
197 |
if(oriWord == word.toLowerCase() || oriWord.search(/(amment|emment|ment|ments)$/) != -1){ |
198 |
step2aDone = true; |
199 |
var b1Regex = /([^aeiouyâàëéêèïîôûù])(îmes|ît|îtes|i|ie|ies|ir|ira|irai|iraIent|irais|irait|iras|irent|irez|iriez|irions|irons|iront|is|issaIent|issais|issait|issant|issante|issantes|issants|isse|issent|isses|issez|issiez|issions|issons|it)$/i; |
200 |
if(word.search(b1Regex) >= rvIndex){ |
201 |
word = word.replace(b1Regex,'$1'); |
202 |
} |
203 |
} |
204 |
|
205 |
/* Step 2b: Other verb suffixes*/ |
206 |
if (step2aDone && wordStep1 == word) { |
207 |
if (word.search(/(ions)$/) >= r2Index) { |
208 |
word = word.replace(/(ions)$/, ''); |
209 |
} else { |
210 |
var b2Regex = /(é|ée|ées|és|èrent|er|era|erai|eraIent|erais|erait|eras|erez|eriez|erions|erons|eront|ez|iez)$/i; |
211 |
if (word.search(b2Regex) >= rvIndex) { |
212 |
word = word.replace(b2Regex, ''); |
213 |
} else { |
214 |
var b3Regex = /e(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i; |
215 |
if (word.search(b3Regex) >= rvIndex) { |
216 |
word = word.replace(b3Regex, ''); |
217 |
} else { |
218 |
var b3Regex2 = /(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i; |
219 |
if (word.search(b3Regex2) >= rvIndex) { |
220 |
word = word.replace(b3Regex2, ''); |
221 |
} |
222 |
} |
223 |
} |
224 |
} |
225 |
} |
226 |
|
227 |
if(oriWord != word.toLowerCase()){ |
228 |
/* Step 3 */ |
229 |
var rep = ''; |
230 |
if(word.search(/Y$/) != -1) { |
231 |
word = word.replace(/Y$/, 'i'); |
232 |
} else if(word.search(/ç$/) != -1){ |
233 |
word = word.replace(/ç$/, 'c'); |
234 |
} |
235 |
} else { |
236 |
/* Step 4 */ |
237 |
//If the word ends s, not preceded by a, i, o, u, è or s, delete it. |
238 |
if (word.search(/([^aiouès])s$/) >= rvIndex) { |
239 |
word = word.replace(/([^aiouès])s$/, '$1'); |
240 |
} |
241 |
var e1Index = word.search(/ion$/); |
242 |
if (e1Index >= r2Index && word.search(/[st]ion$/) >= rvIndex) { |
243 |
word = word.substring(0, e1Index); |
244 |
} else { |
245 |
var e2Index = word.search(/(ier|ière|Ier|Ière)$/); |
246 |
if (e2Index != -1 && e2Index >= rvIndex) { |
247 |
word = word.substring(0, e2Index) + "i"; |
248 |
} else { |
249 |
if (word.search(/e$/) >= rvIndex) { |
250 |
word = word.replace(/e$/, ''); //delete last e |
251 |
} else if (word.search(/guë$/) >= rvIndex) { |
252 |
word = word.replace(/guë$/, 'gu'); |
253 |
} |
254 |
} |
255 |
} |
256 |
} |
257 |
|
258 |
/* Step 5: Undouble */ |
259 |
//word = word.replace(/(en|on|et|el|eil)(n|t|l)$/,'$1'); |
260 |
word = word.replace(/(en|on)(n)$/,'$1'); |
261 |
word = word.replace(/(ett)$/,'et'); |
262 |
word = word.replace(/(el|eil)(l)$/,'$1'); |
263 |
|
264 |
/* Step 6: Un-accent */ |
265 |
word = word.replace(/[éè]([^aeiouyâàëéêèïîôûù]+)$/,'e$1'); |
266 |
word = word.toLowerCase(); |
267 |
return word; |
268 |
}; |
269 |
|
270 |
var eqOut = new Array(); |
271 |
var noteqOut = new Array(); |
272 |
var eqCount = 0; |
273 |
/* |
274 |
To test the stemming, create two arrays named "voc" and "COut" which are for vocabualary and the stemmed output. |
275 |
Then add the vocabulary strings and output strings. This method will generate the stemmed output for "voc" and will |
276 |
compare the output with COut. |
277 |
(I used porter's voc and out files and did a regex to convert them to js objects. regex: /");\nvoc.push("/g . This |
278 |
will add strings to voc array such that output would look like: voc.push("foobar"); ) drop me an email for any help. |
279 |
*/ |
280 |
function testFr(){ |
281 |
var start = new Date().getTime(); //execution time |
282 |
eqCount = 0; |
283 |
eqOut = new Array(); |
284 |
noteqOut = new Array(); |
285 |
for(var k=0;k<voc.length;k++){ |
286 |
if(COut[k]==stemmer(voc[k])){ |
287 |
eqCount++; |
288 |
eqOut.push("v: "+voc[k]+" c: "+COut[k]); |
289 |
} else { |
290 |
noteqOut.push(voc[k]+", c: "+COut[k]+" s:"+stemmer(voc[k])); |
291 |
} |
292 |
} |
293 |
var end = new Date().getTime(); //execution time |
294 |
var time = end-start; |
295 |
alert("equal count= "+eqCount+" out of "+voc.length+" words. time= "+time+" ms"); |
296 |
//console.log("equal count= "+eqCount+" out of "+voc.length+" words. time= "+time+" ms"); |
297 |
} |
298 |
|
299 |
|