search/stemmers/fr_stemmer.js

/*
 * Author: Kasun Gajasinghe
 * E-Mail: kasunbg AT gmail DOT com
 * Date: 09.08.2010
 *
 * usage: stemmer(word);
 * ex: var stem = stemmer(foobar);
 * Implementation of the stemming algorithm from http://snowball.tartarus.org/algorithms/french/stemmer.html
 *
 * LICENSE:
 *
 * Copyright (c) 2010, Kasun Gajasinghe. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *    1. Redistributions of source code must retain the above copyright notice,
 *       this list of conditions and the following disclaimer.
 *
 *    2. Redistributions in binary form must reproduce the above copyright notice,
 *       this list of conditions and the following disclaimer in the documentation
 *       and/or other materials provided with the distribution.
 *
 *
 * THIS SOFTWARE IS PROVIDED BY KASUN GAJASINGHE ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL KASUN GAJASINGHE BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

var stemmer = function(word){
//    Letters in French include the following accented forms,
//        â   à   ç   ë   é   ê   è   ï   î   ô   û   ù
//    The following letters are vowels:
//        a   e   i   o   u   y   â   à   ë   é   ê   è   ï   î   ô   û   ù
    
    word = word.toLowerCase();
    var oriWord = word;
    word = word.replace(/qu/g, 'qU');   //have to perform first, as after the operation, capital U is not treated as a vowel
    word = word.replace(/([aeiouyâàëéêèïîôûù])u([aeiouyâàëéêèïîôûù])/g, '$1U$2');
    word = word.replace(/([aeiouyâàëéêèïîôûù])i([aeiouyâàëéêèïîôûù])/g, '$1I$2');
    word = word.replace(/([aeiouyâàëéêèïîôûù])y/g, '$1Y');
    word = word.replace(/y([aeiouyâàëéêèïîôûù])/g, 'Y$1');
 
    var rv='';
    var rvIndex = -1;
    if(word.search(/^(par|col|tap)/) != -1 || word.search(/^[aeiouyâàëéêèïîôûù]{2}/) != -1){
        rv = word.substring(3);
        rvIndex = 3;
    } else {
        rvIndex = word.substring(1).search(/[aeiouyâàëéêèïîôûù]/);
        if(rvIndex != -1){
            rvIndex +=2;   //+2 is to supplement the substring(1) used to find rvIndex
            rv = word.substring(rvIndex);
        } else {
            rvIndex = word.length;
        }
    }

//    R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.
//    R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel
    var r1Index = word.search(/[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/);
    var r1 = '';
    if (r1Index != -1) {
        r1Index += 2;
        r1 = word.substring(r1Index);
    } else {
        r1Index = word.length;        
    }

    var r2Index = -1;
    var r2 = '';
    if (r1Index != -1) {
        r2Index = r1.search(/[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/);
        if (r2Index != -1) {
            r2Index += 2;
            r2 = r1.substring(r2Index);
            r2Index += r1Index;
        } else {
            r2 = '';
            r2Index = word.length;            
        }
    }
    if (r1Index != -1 && r1Index < 3) {
        r1Index = 3;
        r1 = word.substring(r1Index);
    }

    /*
    Step 1: Standard suffix removal
    */
    var a1Index = word.search(/(ance|iqUe|isme|able|iste|eux|ances|iqUes|ismes|ables|istes)$/);
    var a2Index = word.search(/(atrice|ateur|ation|atrices|ateurs|ations)$/);
    var a3Index = word.search(/(logie|logies)$/);
    var a4Index = word.search(/(usion|ution|usions|utions)$/);
    var a5Index = word.search(/(ence|ences)$/);
    var a6Index = word.search(/(ement|ements)$/);
    var a7Index = word.search(/(ité|ités)$/);
    var a8Index = word.search(/(if|ive|ifs|ives)$/);
    var a9Index = word.search(/(eaux)$/);
    var a10Index = word.search(/(aux)$/);
    var a11Index = word.search(/(euse|euses)$/);
    var a12Index = word.search(/[^aeiouyâàëéêèïîôûù](issement|issements)$/);
    var a13Index = word.search(/(amment)$/);
    var a14Index = word.search(/(emment)$/);
    var a15Index = word.search(/[aeiouyâàëéêèïîôûù](ment|ments)$/);

    if(a1Index != -1 && a1Index >= r2Index){
        word = word.substring(0,a1Index);
    } else if(a2Index != -1 && a2Index >= r2Index){
        word = word.substring(0,a2Index);
        var a2Index2 = word.search(/(ic)$/);
        if(a2Index2 != -1 && a2Index2 >= r2Index){
            word = word.substring(0, a2Index2);        //if preceded by ic, delete if in R2,
        } else {                                //else replace by iqU
            word = word.replace(/(ic)$/,'iqU');
        }
    } else if(a3Index != -1 && a3Index >= r2Index){
        word = word.replace(/(logie|logies)$/,'log');  //replace with log if in R2
    } else if(a4Index != -1 && a4Index >= r2Index){
        word = word.replace(/(usion|ution|usions|utions)$/,'u');  //replace with u if in R2
    } else if(a5Index != -1 && a5Index >= r2Index){
        word = word.replace(/(ence|ences)$/,'ent');  //replace with ent if in R2
    } else if(a6Index != -1 && a6Index >= rvIndex){
        word = word.substring(0,a6Index);
        if(word.search(/(iv)$/) >= r2Index){
            word = word.replace(/(iv)$/, '');
            if(word.search(/(at)$/) >= r2Index){
                word = word.replace(/(at)$/, '');
            }
        } else if(word.search(/(eus)$/) != -1){
            var a6Index2 = word.search(/(eus)$/);
            if(a6Index2 >=r2Index){
                word = word.substring(0, a6Index2);    
            } else if(a6Index2 >= r1Index){
                word = word.substring(0,a6Index2)+"eux";
            }
        } else if(word.search(/(abl|iqU)$/) >= r2Index){
            word = word.replace(/(abl|iqU)$/,'');   //if preceded by abl or iqU, delete if in R2,
        } else if(word.search(/(ièr|Ièr)$/) >= rvIndex){
            word = word.replace(/(ièr|Ièr)$/,'i');   //if preceded by abl or iqU, delete if in R2,
        } 
    } else if(a7Index != -1 && a7Index >= r2Index){
        word = word.substring(0,a7Index);   //delete if in R2
        if(word.search(/(abil)$/) != -1){   //if preceded by abil, delete if in R2, else replace by abl, otherwise,
            var a7Index2 = word.search(/(abil)$/);
            if(a7Index2 >=r2Index){
                word = word.substring(0, a7Index2);
            } else {
                word = word.substring(0,a7Index2)+"abl";
            }
        } else if(word.search(/(ic)$/) != -1){
            var a7Index3 = word.search(/(ic)$/);
            if(a7Index3 != -1 && a7Index3 >= r2Index){
                word = word.substring(0, a7Index3);        //if preceded by ic, delete if in R2,
            } else {                                //else replace by iqU
                word = word.replace(/(ic)$/,'iqU');
            }
        } else if(word.search(/(iv)$/) != r2Index){
            word = word.replace(/(iv)$/,'');                        
        }
    } else if(a8Index != -1 && a8Index >= r2Index){
        word = word.substring(0,a8Index);
        if(word.search(/(at)$/) >= r2Index){
            word = word.replace(/(at)$/, '');
            if(word.search(/(ic)$/) >= r2Index){
                word = word.replace(/(ic)$/, '');
            } else { word = word.replace(/(ic)$/, 'iqU'); }
        }
    } else if(a9Index != -1){ word = word.replace(/(eaux)/,'eau')
    } else if(a10Index >= r1Index){ word = word.replace(/(aux)/,'al')
    } else if(a11Index != -1 ){
        var a11Index2 = word.search(/(euse|euses)$/);
        if(a11Index2 >=r2Index){
            word = word.substring(0, a11Index2);
        } else if(a11Index2 >= r1Index){
            word = word.substring(0, a11Index2)+"eux";
        }
    } else if(a12Index!=-1 && a12Index>=r1Index){
        word = word.substring(0,a12Index+1);    //+1- amendment to non-vowel
    } else if(a13Index!=-1 && a13Index>=rvIndex){
        word = word.replace(/(amment)$/,'ant');
    } else if(a14Index!=-1 && a14Index>=rvIndex){
        word = word.replace(/(emment)$/,'ent');
    } else if(a15Index!=-1 && a15Index>=rvIndex){
        word = word.substring(0,a15Index+1);
    }

    /* Step 2a: Verb suffixes beginning i*/
    var wordStep1 = word;
    var step2aDone = false;
    if(oriWord == word.toLowerCase() || oriWord.search(/(amment|emment|ment|ments)$/) != -1){
        step2aDone = true;
        var b1Regex = /([^aeiouyâàëéêèïîôûù])(îmes|ît|îtes|i|ie|ies|ir|ira|irai|iraIent|irais|irait|iras|irent|irez|iriez|irions|irons|iront|is|issaIent|issais|issait|issant|issante|issantes|issants|isse|issent|isses|issez|issiez|issions|issons|it)$/i;
        if(word.search(b1Regex) >= rvIndex){
            word = word.replace(b1Regex,'$1');
        }
    }

    /* Step 2b:  Other verb suffixes*/
    if (step2aDone && wordStep1 == word) {
        if (word.search(/(ions)$/) >= r2Index) {
            word = word.replace(/(ions)$/, '');
        } else {
            var b2Regex = /(é|ée|ées|és|èrent|er|era|erai|eraIent|erais|erait|eras|erez|eriez|erions|erons|eront|ez|iez)$/i;
            if (word.search(b2Regex) >= rvIndex) {
                word = word.replace(b2Regex, '');
            } else {
                var b3Regex = /e(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i;
                if (word.search(b3Regex) >= rvIndex) {
                    word = word.replace(b3Regex, '');
                } else {
                    var b3Regex2 = /(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i;
                    if (word.search(b3Regex2) >= rvIndex) {
                        word = word.replace(b3Regex2, '');
                    }
                }
            }
        }
    }
    
    if(oriWord != word.toLowerCase()){
        /* Step 3 */
        var rep = '';
        if(word.search(/Y$/) != -1) {
            word = word.replace(/Y$/, 'i');
        } else if(word.search(/ç$/) != -1){
            word = word.replace(/ç$/, 'c');
        }
    } else {
        /* Step 4 */
        //If the word ends s, not preceded by a, i, o, u, è or s, delete it.
        if (word.search(/([^aiouès])s$/) >= rvIndex) {
            word = word.replace(/([^aiouès])s$/, '$1');
        }
        var e1Index = word.search(/ion$/);
        if (e1Index >= r2Index && word.search(/[st]ion$/) >= rvIndex) {
            word = word.substring(0, e1Index);
        } else {
            var e2Index = word.search(/(ier|ière|Ier|Ière)$/);
            if (e2Index != -1 && e2Index >= rvIndex) {
                word = word.substring(0, e2Index) + "i";
            } else {
                if (word.search(/e$/) >= rvIndex) {
                    word = word.replace(/e$/, '');   //delete last e
                } else if (word.search(/guë$/) >= rvIndex) {
                    word = word.replace(/guë$/, 'gu');
                }
            }
        }
    }
    
    /* Step 5: Undouble */
    //word = word.replace(/(en|on|et|el|eil)(n|t|l)$/,'$1');
    word = word.replace(/(en|on)(n)$/,'$1');
    word = word.replace(/(ett)$/,'et');
    word = word.replace(/(el|eil)(l)$/,'$1');

    /* Step 6: Un-accent */
    word = word.replace(/[éè]([^aeiouyâàëéêèïîôûù]+)$/,'e$1');
    word = word.toLowerCase();
    return word;
};

var eqOut = new Array();
var noteqOut = new Array();
var eqCount = 0;
/*
To test the stemming, create two arrays named "voc" and "COut" which are for vocabualary and the stemmed output.
Then add the vocabulary strings and output strings. This method will generate the stemmed output for "voc" and will
compare the output with COut.
 (I used porter's voc and out files and did a regex to convert them to js objects. regex: /");\nvoc.push("/g . This
  will add strings to voc array such that output would look like: voc.push("foobar"); ) drop me an email for any help.
 */
function testFr(){
    var start = new Date().getTime(); //execution time
    eqCount = 0;
    eqOut = new Array();
    noteqOut = new Array();
    for(var k=0;k<voc.length;k++){
        if(COut[k]==stemmer(voc[k])){
            eqCount++;
            eqOut.push("v: "+voc[k]+" c: "+COut[k]);    
        } else {
            noteqOut.push(voc[k]+", c: "+COut[k]+" s:"+stemmer(voc[k]));
        }
    }
    var end = new Date().getTime(); //execution time
    var time = end-start;
    alert("equal count= "+eqCount+" out of "+voc.length+" words. time= "+time+" ms");
    //console.log("equal count= "+eqCount+" out of "+voc.length+" words. time= "+time+" ms");
}


1	/*
2	* Author: Kasun Gajasinghe
3	* E-Mail: kasunbg AT gmail DOT com
4	* Date: 09.08.2010
5	*
6	* usage: stemmer(word);
7	* ex: var stem = stemmer(foobar);
8	* Implementation of the stemming algorithm from http://snowball.tartarus.org/algorithms/french/stemmer.html
9	*
10	* LICENSE:
11	*
12	* Copyright (c) 2010, Kasun Gajasinghe. All rights reserved.
13	*
14	* Redistribution and use in source and binary forms, with or without modification,
15	* are permitted provided that the following conditions are met:
16	*
17	* 1. Redistributions of source code must retain the above copyright notice,
18	* this list of conditions and the following disclaimer.
19	*
20	* 2. Redistributions in binary form must reproduce the above copyright notice,
21	* this list of conditions and the following disclaimer in the documentation
22	* and/or other materials provided with the distribution.
23	*
24	*
25	* THIS SOFTWARE IS PROVIDED BY KASUN GAJASINGHE ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
26	* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
27	* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL KASUN GAJASINGHE BE LIABLE FOR ANY DIRECT,
28	* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
30	* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
31	* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
32	* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33	*
34	*/
35
36	var stemmer = function(word){
37	// Letters in French include the following accented forms,
38	// â à ç ë é ê è ï î ô û ù
39	// The following letters are vowels:
40	// a e i o u y â à ë é ê è ï î ô û ù
41
42	word = word.toLowerCase();
43	var oriWord = word;
44	word = word.replace(/qu/g, 'qU'); //have to perform first, as after the operation, capital U is not treated as a vowel
45	word = word.replace(/([aeiouyâàëéêèïîôûù])u([aeiouyâàëéêèïîôûù])/g, '$1U$2');
46	word = word.replace(/([aeiouyâàëéêèïîôûù])i([aeiouyâàëéêèïîôûù])/g, '$1I$2');
47	word = word.replace(/([aeiouyâàëéêèïîôûù])y/g, '$1Y');
48	word = word.replace(/y([aeiouyâàëéêèïîôûù])/g, 'Y$1');
49
50	var rv='';
51	var rvIndex = -1;
52	if(word.search(/^(par\|col\|tap)/) != -1 \|\| word.search(/^[aeiouyâàëéêèïîôûù]{2}/) != -1){
53	rv = word.substring(3);
54	rvIndex = 3;
55	} else {
56	rvIndex = word.substring(1).search(/[aeiouyâàëéêèïîôûù]/);
57	if(rvIndex != -1){
58	rvIndex +=2; //+2 is to supplement the substring(1) used to find rvIndex
59	rv = word.substring(rvIndex);
60	} else {
61	rvIndex = word.length;
62	}
63	}
64
65	// R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel.
66	// R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel
67	var r1Index = word.search(/[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/);
68	var r1 = '';
69	if (r1Index != -1) {
70	r1Index += 2;
71	r1 = word.substring(r1Index);
72	} else {
73	r1Index = word.length;
74	}
75
76	var r2Index = -1;
77	var r2 = '';
78	if (r1Index != -1) {
79	r2Index = r1.search(/[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/);
80	if (r2Index != -1) {
81	r2Index += 2;
82	r2 = r1.substring(r2Index);
83	r2Index += r1Index;
84	} else {
85	r2 = '';
86	r2Index = word.length;
87	}
88	}
89	if (r1Index != -1 && r1Index < 3) {
90	r1Index = 3;
91	r1 = word.substring(r1Index);
92	}
93
94	/*
95	Step 1: Standard suffix removal
96	*/
97	var a1Index = word.search(/(ance\|iqUe\|isme\|able\|iste\|eux\|ances\|iqUes\|ismes\|ables\|istes)$/);
98	var a2Index = word.search(/(atrice\|ateur\|ation\|atrices\|ateurs\|ations)$/);
99	var a3Index = word.search(/(logie\|logies)$/);
100	var a4Index = word.search(/(usion\|ution\|usions\|utions)$/);
101	var a5Index = word.search(/(ence\|ences)$/);
102	var a6Index = word.search(/(ement\|ements)$/);
103	var a7Index = word.search(/(ité\|ités)$/);
104	var a8Index = word.search(/(if\|ive\|ifs\|ives)$/);
105	var a9Index = word.search(/(eaux)$/);
106	var a10Index = word.search(/(aux)$/);
107	var a11Index = word.search(/(euse\|euses)$/);
108	var a12Index = word.search(/[^aeiouyâàëéêèïîôûù](issement\|issements)$/);
109	var a13Index = word.search(/(amment)$/);
110	var a14Index = word.search(/(emment)$/);
111	var a15Index = word.search(/[aeiouyâàëéêèïîôûù](ment\|ments)$/);
112
113	if(a1Index != -1 && a1Index >= r2Index){
114	word = word.substring(0,a1Index);
115	} else if(a2Index != -1 && a2Index >= r2Index){
116	word = word.substring(0,a2Index);
117	var a2Index2 = word.search(/(ic)$/);
118	if(a2Index2 != -1 && a2Index2 >= r2Index){
119	word = word.substring(0, a2Index2); //if preceded by ic, delete if in R2,
120	} else { //else replace by iqU
121	word = word.replace(/(ic)$/,'iqU');
122	}
123	} else if(a3Index != -1 && a3Index >= r2Index){
124	word = word.replace(/(logie\|logies)$/,'log'); //replace with log if in R2
125	} else if(a4Index != -1 && a4Index >= r2Index){
126	word = word.replace(/(usion\|ution\|usions\|utions)$/,'u'); //replace with u if in R2
127	} else if(a5Index != -1 && a5Index >= r2Index){
128	word = word.replace(/(ence\|ences)$/,'ent'); //replace with ent if in R2
129	} else if(a6Index != -1 && a6Index >= rvIndex){
130	word = word.substring(0,a6Index);
131	if(word.search(/(iv)$/) >= r2Index){
132	word = word.replace(/(iv)$/, '');
133	if(word.search(/(at)$/) >= r2Index){
134	word = word.replace(/(at)$/, '');
135	}
136	} else if(word.search(/(eus)$/) != -1){
137	var a6Index2 = word.search(/(eus)$/);
138	if(a6Index2 >=r2Index){
139	word = word.substring(0, a6Index2);
140	} else if(a6Index2 >= r1Index){
141	word = word.substring(0,a6Index2)+"eux";
142	}
143	} else if(word.search(/(abl\|iqU)$/) >= r2Index){
144	word = word.replace(/(abl\|iqU)$/,''); //if preceded by abl or iqU, delete if in R2,
145	} else if(word.search(/(ièr\|Ièr)$/) >= rvIndex){
146	word = word.replace(/(ièr\|Ièr)$/,'i'); //if preceded by abl or iqU, delete if in R2,
147	}
148	} else if(a7Index != -1 && a7Index >= r2Index){
149	word = word.substring(0,a7Index); //delete if in R2
150	if(word.search(/(abil)$/) != -1){ //if preceded by abil, delete if in R2, else replace by abl, otherwise,
151	var a7Index2 = word.search(/(abil)$/);
152	if(a7Index2 >=r2Index){
153	word = word.substring(0, a7Index2);
154	} else {
155	word = word.substring(0,a7Index2)+"abl";
156	}
157	} else if(word.search(/(ic)$/) != -1){
158	var a7Index3 = word.search(/(ic)$/);
159	if(a7Index3 != -1 && a7Index3 >= r2Index){
160	word = word.substring(0, a7Index3); //if preceded by ic, delete if in R2,
161	} else { //else replace by iqU
162	word = word.replace(/(ic)$/,'iqU');
163	}
164	} else if(word.search(/(iv)$/) != r2Index){
165	word = word.replace(/(iv)$/,'');
166	}
167	} else if(a8Index != -1 && a8Index >= r2Index){
168	word = word.substring(0,a8Index);
169	if(word.search(/(at)$/) >= r2Index){
170	word = word.replace(/(at)$/, '');
171	if(word.search(/(ic)$/) >= r2Index){
172	word = word.replace(/(ic)$/, '');
173	} else { word = word.replace(/(ic)$/, 'iqU'); }
174	}
175	} else if(a9Index != -1){ word = word.replace(/(eaux)/,'eau')
176	} else if(a10Index >= r1Index){ word = word.replace(/(aux)/,'al')
177	} else if(a11Index != -1 ){
178	var a11Index2 = word.search(/(euse\|euses)$/);
179	if(a11Index2 >=r2Index){
180	word = word.substring(0, a11Index2);
181	} else if(a11Index2 >= r1Index){
182	word = word.substring(0, a11Index2)+"eux";
183	}
184	} else if(a12Index!=-1 && a12Index>=r1Index){
185	word = word.substring(0,a12Index+1); //+1- amendment to non-vowel
186	} else if(a13Index!=-1 && a13Index>=rvIndex){
187	word = word.replace(/(amment)$/,'ant');
188	} else if(a14Index!=-1 && a14Index>=rvIndex){
189	word = word.replace(/(emment)$/,'ent');
190	} else if(a15Index!=-1 && a15Index>=rvIndex){
191	word = word.substring(0,a15Index+1);
192	}
193
194	/* Step 2a: Verb suffixes beginning i*/
195	var wordStep1 = word;
196	var step2aDone = false;
197	if(oriWord == word.toLowerCase() \|\| oriWord.search(/(amment\|emment\|ment\|ments)$/) != -1){
198	step2aDone = true;
199	var b1Regex = /([^aeiouyâàëéêèïîôûù])(îmes\|ît\|îtes\|i\|ie\|ies\|ir\|ira\|irai\|iraIent\|irais\|irait\|iras\|irent\|irez\|iriez\|irions\|irons\|iront\|is\|issaIent\|issais\|issait\|issant\|issante\|issantes\|issants\|isse\|issent\|isses\|issez\|issiez\|issions\|issons\|it)$/i;
200	if(word.search(b1Regex) >= rvIndex){
201	word = word.replace(b1Regex,'$1');
202	}
203	}
204
205	/* Step 2b: Other verb suffixes*/
206	if (step2aDone && wordStep1 == word) {
207	if (word.search(/(ions)$/) >= r2Index) {
208	word = word.replace(/(ions)$/, '');
209	} else {
210	var b2Regex = /(é\|ée\|ées\|és\|èrent\|er\|era\|erai\|eraIent\|erais\|erait\|eras\|erez\|eriez\|erions\|erons\|eront\|ez\|iez)$/i;
211	if (word.search(b2Regex) >= rvIndex) {
212	word = word.replace(b2Regex, '');
213	} else {
214	var b3Regex = /e(âmes\|ât\|âtes\|a\|ai\|aIent\|ais\|ait\|ant\|ante\|antes\|ants\|as\|asse\|assent\|asses\|assiez\|assions)$/i;
215	if (word.search(b3Regex) >= rvIndex) {
216	word = word.replace(b3Regex, '');
217	} else {
218	var b3Regex2 = /(âmes\|ât\|âtes\|a\|ai\|aIent\|ais\|ait\|ant\|ante\|antes\|ants\|as\|asse\|assent\|asses\|assiez\|assions)$/i;
219	if (word.search(b3Regex2) >= rvIndex) {
220	word = word.replace(b3Regex2, '');
221	}
222	}
223	}
224	}
225	}
226
227	if(oriWord != word.toLowerCase()){
228	/* Step 3 */
229	var rep = '';
230	if(word.search(/Y$/) != -1) {
231	word = word.replace(/Y$/, 'i');
232	} else if(word.search(/ç$/) != -1){
233	word = word.replace(/ç$/, 'c');
234	}
235	} else {
236	/* Step 4 */
237	//If the word ends s, not preceded by a, i, o, u, è or s, delete it.
238	if (word.search(/([^aiouès])s$/) >= rvIndex) {
239	word = word.replace(/([^aiouès])s$/, '$1');
240	}
241	var e1Index = word.search(/ion$/);
242	if (e1Index >= r2Index && word.search(/[st]ion$/) >= rvIndex) {
243	word = word.substring(0, e1Index);
244	} else {
245	var e2Index = word.search(/(ier\|ière\|Ier\|Ière)$/);
246	if (e2Index != -1 && e2Index >= rvIndex) {
247	word = word.substring(0, e2Index) + "i";
248	} else {
249	if (word.search(/e$/) >= rvIndex) {
250	word = word.replace(/e$/, ''); //delete last e
251	} else if (word.search(/guë$/) >= rvIndex) {
252	word = word.replace(/guë$/, 'gu');
253	}
254	}
255	}
256	}
257
258	/* Step 5: Undouble */
259	//word = word.replace(/(en\|on\|et\|el\|eil)(n\|t\|l)$/,'$1');
260	word = word.replace(/(en\|on)(n)$/,'$1');
261	word = word.replace(/(ett)$/,'et');
262	word = word.replace(/(el\|eil)(l)$/,'$1');
263
264	/* Step 6: Un-accent */
265	word = word.replace(/[éè]([^aeiouyâàëéêèïîôûù]+)$/,'e$1');
266	word = word.toLowerCase();
267	return word;
268	};
269
270	var eqOut = new Array();
271	var noteqOut = new Array();
272	var eqCount = 0;
273	/*
274	To test the stemming, create two arrays named "voc" and "COut" which are for vocabualary and the stemmed output.
275	Then add the vocabulary strings and output strings. This method will generate the stemmed output for "voc" and will
276	compare the output with COut.
277	(I used porter's voc and out files and did a regex to convert them to js objects. regex: /");\nvoc.push("/g . This
278	will add strings to voc array such that output would look like: voc.push("foobar"); ) drop me an email for any help.
279	*/
280	function testFr(){
281	var start = new Date().getTime(); //execution time
282	eqCount = 0;
283	eqOut = new Array();
284	noteqOut = new Array();
285	for(var k=0;k<voc.length;k++){
286	if(COut[k]==stemmer(voc[k])){
287	eqCount++;
288	eqOut.push("v: "+voc[k]+" c: "+COut[k]);
289	} else {
290	noteqOut.push(voc[k]+", c: "+COut[k]+" s:"+stemmer(voc[k]));
291	}
292	}
293	var end = new Date().getTime(); //execution time
294	var time = end-start;
295	alert("equal count= "+eqCount+" out of "+voc.length+" words. time= "+time+" ms");
296	//console.log("equal count= "+eqCount+" out of "+voc.length+" words. time= "+time+" ms");
297	}
298
299