MBG SQLite
Not logged in

FTS3 Snowball Stemmer

As the Wikipedia page http://en.wikipedia.org/wiki/Stemming says:

In linguistic morphology, stemming is the process for reducing inflected (or sometimes derived) words to their stem, base or root form – generally a written word form. The stem need not be identical to the morphological root of the word; it is usually sufficient that related words map to the same stem, even if this stem is not in itself a valid root. Algorithms for stemming have been studied in computer science since 1968. Many search engines treat words with the same stem as synonyms as a kind of query broadening, a process called conflation.

Stemming programs are commonly referred to as stemming algorithms or stemmers.

Snowball is a non-dictionary, multi-language stemmer. As an example, see the details of the Russian stemmer: Russian stemming algorithm

FTS3 Snowball example

The "russian" collation was added by check-in [82b339b938]. Snowball stemmer support was added by [b7837f1bb9], and the Debian package files were updated by [f1b99fe082].

CREATE VIRTUAL TABLE fts USING fts4(text,TOKENIZE icu russian);
insert into fts (text) values ('Нафига');
insert into fts (text) values ('попу');
insert into fts (text) values ('наган');
insert into fts (text) values ('если');
insert into fts (text) values ('поп');
insert into fts (text) values ('не');
insert into fts (text) values ('хулиган');

select * from fts where fts match 'поп';
    попу
    поп
select * from fts where fts match 'не';
    не
select * from fts where fts match 'нафиг';
    Нафига

Note: "russian" is the stemmer name, not a collation:

pragma collation_list;
    1|NOCASE
    2|RTRIM
    3|BINARY

Snowball example

/* This is a simple program which uses libstemmer to provide a command
 * line interface for stemming using any of the algorithms provided.
 */

#include <stdio.h>
#include <stdlib.h> /* for malloc, free */
#include <string.h> /* for memmove */

#include "libstemmer.h"

/*
 * Read newline-separated words from stdin, stem each one with `stemmer`,
 * and write every stem to stdout followed by a newline.
 *
 * Each input line is passed to the stemmer as raw bytes, exactly as read;
 * an empty line is stemmed as a zero-length word.  A final line without a
 * trailing newline is still processed.
 *
 * The line buffer starts at INC symbols and grows by INC symbols whenever
 * it fills up.  If the buffer cannot be allocated or grown, a diagnostic is
 * written to stderr and the function returns early (the line being read is
 * discarded).  If sb_stemmer_stem() returns NULL, which is treated as an
 * out-of-memory condition, the program exits with status 1.
 */
static void
stem_file(struct sb_stemmer * stemmer)
{
#define INC 10
    int lim = INC;
    sb_symbol * b = malloc(lim * sizeof *b);

    if (b == NULL) {
        fprintf(stderr, "Out of memory\n");
        return;
    }

    while (1) {
        int i = 0;
        int ch = getc(stdin);

        /* EOF at the start of a line means there is no further word. */
        if (ch == EOF) {
            break;
        }

        /* Collect one line (without its newline) into b[0..i). */
        while (ch != '\n' && ch != EOF) {
            if (i == lim) {
                /* Grow through a temporary so that b stays valid, and can
                 * still be freed, when realloc fails. */
                sb_symbol * newb = realloc(b, (lim + INC) * sizeof *b);
                if (newb == NULL) {
                    fprintf(stderr, "Out of memory\n");
                    goto done;
                }
                b = newb;
                lim = lim + INC;
            }
            b[i] = ch;
            i++;
            ch = getc(stdin);
        }

        const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i);
        if (stemmed == NULL) {
            fprintf(stderr, "Out of memory\n");
            exit(1);
        }

        /* NOTE(review): fputs relies on the stemmer returning a
         * NUL-terminated buffer; confirm against libstemmer.h. */
        fputs((const char *) stemmed, stdout);
        putc('\n', stdout);
    }

done:
    free(b);
}

/*
 * Program entry point.
 *
 * Creates a stemmer for the hard-coded "russian" algorithm, runs
 * stem_file() with it, and then deletes the stemmer.  The command-line
 * arguments are not used.
 *
 * Returns 0 on success.  If the stemmer cannot be created, a message is
 * written to stderr and the process exits with status 1.
 */
int
main(int argc, char * argv[])
{
    const char * const algorithm = "russian";

    /* NULL is passed as the second argument; presumably this selects the
     * library's default character encoding (confirm against libstemmer.h). */
    struct sb_stemmer * stemmer = sb_stemmer_new(algorithm, NULL);

    if (!stemmer) {
        fprintf(stderr, "language `%s' not available for stemming\n", algorithm);
        exit(1);
    }

    stem_file(stemmer);
    sb_stemmer_delete(stemmer);

    return 0;
}

Build and run the example as follows:

gcc stemwords.c -I ../include/ -L. -lstemmer -o stemwords
echo -e "Нафига\nпопу\nнаган\nесли\nпоп\nне\nхулиган"|./stemwords
Нафиг
поп
нага
есл
поп
не
хулига