Re: [PROPOSAL] Nepali Snowball dictionary
Hello all, On Wed, Feb 28, 2018 at 11:16:24AM +0300, Arthur Zakirov wrote: > I've sent a pull request with nepali snowball algorithm into > https://github.com/snowballstem [1]. They aren't againts the patch. > > They haven't merged it yet, though. There are some problems with > continuous testing via Travis CI which aren't related with the patch and > require fix some scripts. The patch with nepali stemmer was applied into the Snowball upstream [1]. I attached second version of the patch. Nepali stemmer was generated by latest Snowball compiler (commit 3c3d3953e8174b55e86aedd89544ea4e5d76db78). I will add new commitfest entry. Authors: - Ingroj Shrestha, Nepali NLP Group - Oleg Bartunov, Postgres Professional Ltd. - Shreeya Singh Dhakal, Nepali NLP Group 1 - https://github.com/snowballstem/snowball/pull/70 -- Arthur Zakirov Postgres Professional: http://www.postgrespro.com Russian Postgres Company diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 8075ea94e7..61280b03fc 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -3803,6 +3803,7 @@ Parser: "pg_catalog.default" pg_catalog | german_stem | snowball stemmer for german language pg_catalog | hungarian_stem | snowball stemmer for hungarian language pg_catalog | italian_stem| snowball stemmer for italian language + pg_catalog | nepali_stem | snowball stemmer for nepali language pg_catalog | norwegian_stem | snowball stemmer for norwegian language pg_catalog | portuguese_stem | snowball stemmer for portuguese language pg_catalog | romanian_stem | snowball stemmer for romanian language diff --git a/src/backend/snowball/Makefile b/src/backend/snowball/Makefile index 50cbace41d..c29f4184f2 100644 --- a/src/backend/snowball/Makefile +++ b/src/backend/snowball/Makefile @@ -40,6 +40,7 @@ OBJS= $(WIN32RES) dict_snowball.o api.o utilities.o \ stem_UTF_8_german.o \ stem_UTF_8_hungarian.o \ stem_UTF_8_italian.o \ + stem_UTF_8_nepali.o \ stem_UTF_8_norwegian.o \ 
stem_UTF_8_porter.o \ stem_UTF_8_portuguese.o \ @@ -62,6 +63,7 @@ LANGUAGES= \ german german \ hungarian hungarian \ italian italian \ + nepali nepali \ norwegian norwegian \ portuguese portuguese \ romanianromanian\ diff --git a/src/backend/snowball/dict_snowball.c b/src/backend/snowball/dict_snowball.c index 78c9f73ef0..d96c849118 100644 --- a/src/backend/snowball/dict_snowball.c +++ b/src/backend/snowball/dict_snowball.c @@ -49,6 +49,7 @@ #include "snowball/libstemmer/stem_UTF_8_german.h" #include "snowball/libstemmer/stem_UTF_8_hungarian.h" #include "snowball/libstemmer/stem_UTF_8_italian.h" +#include "snowball/libstemmer/stem_UTF_8_nepali.h" #include "snowball/libstemmer/stem_UTF_8_norwegian.h" #include "snowball/libstemmer/stem_UTF_8_porter.h" #include "snowball/libstemmer/stem_UTF_8_portuguese.h" @@ -102,6 +103,7 @@ static const stemmer_module stemmer_modules[] = {"german", PG_UTF8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, {"hungarian", PG_UTF8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, {"italian", PG_UTF8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, + {"nepali", PG_UTF8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, nepali_UTF_8_stem}, {"norwegian", PG_UTF8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, {"porter", PG_UTF8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem}, {"portuguese", PG_UTF8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, diff --git a/src/backend/snowball/libstemmer/stem_UTF_8_nepali.c b/src/backend/snowball/libstemmer/stem_UTF_8_nepali.c new file mode 100644 index 00..d1c1be76f3 --- /dev/null +++ b/src/backend/snowball/libstemmer/stem_UTF_8_nepali.c @@ -0,0 +1,424 @@ +/* This file was generated automatically by the Snowball to ISO C compiler */ +/* http://snowballstem.org/ */ + +#include "header.h" + +#ifdef __cplusplus +extern "C" { +#endif +extern int 
nepali_UTF_8_stem(struct SN_env * z); +#ifdef __cplusplus +} +#endif +static int r_remove_category_3(struct SN_env * z); +static int r_remove_category_2(struct SN_env * z); +static int r_check_category_2(struct SN_env * z); +static int r_remove_category_1(struct SN_env * z); +#ifdef __cplusplus +extern "C" { +#endif + + +extern struct SN_env * nepali_UTF_8_create_env(void); +extern void nepali_UTF_8_close_env(struct SN_env * z); + + +#ifdef __cplusplus +} +#endif +static const symbol s_0_0[6] = { 0xE0, 0xA4, 0x95, 0xE0, 0xA5, 0x80 }; +static const symbol s_0_1[9] = { 0xE0, 0xA4, 0xB2, 0xE0, 0xA4, 0xBE, 0xE0, 0xA4, 0x87 }; +static const symbol s_0_2[6] = { 0xE0, 0xA4, 0xB2, 0xE0, 0x
Re: [PROPOSAL] Nepali Snowball dictionary
On Thu, Mar 01, 2018 at 10:23:11PM -0800, Andres Freund wrote: > What is that entry for, if I may ask? We need to wait for them to merge > it, then sync the snowball code, including the nepali dictionary. This > doesn't realistically seem doable for this commitfest. Therefore I think > this should be marked as 'returned with feedback'. I understand the point. I marked the patch as 'Returned with feedback' myself. -- Arthur Zakirov Postgres Professional: http://www.postgrespro.com Russian Postgres Company
Re: [PROPOSAL] Nepali Snowball dictionary
Hi, On 2018-02-28 11:16:24 +0300, Arthur Zakirov wrote: > I've created the commitfest entry https://commitfest.postgresql.org/17/1569/ What is that entry for, if I may ask? We need to wait for them to merge it, then sync the snowball code, including the nepali dictionary. This doesn't realistically seem doable for this commitfest. Therefore I think this should be marked as 'returned with feedback'. Greetings, Andres Freund
Re: [PROPOSAL] Nepali Snowball dictionary
On Tue, Feb 20, 2018 at 12:01:30AM +0300, Arthur Zakirov wrote: > > As best I know, the original list > > http://lists.tartarus.org/mailman/listinfo/snowball-discuss > > is moribund, but there's a fork at > > http://snowballstem.org > > that has at least some activity. > > From the original list it seems that http://snowballstem.org is > frozen. But development work continues at > https://github.com/snowballstem by other people. > I'll try to send them a pull request. I've sent a pull request with the Nepali snowball algorithm to https://github.com/snowballstem [1]. They aren't against the patch. They haven't merged it yet, though. There are some problems with continuous testing via Travis CI which aren't related to the patch and require fixing some scripts. I've created the commitfest entry https://commitfest.postgresql.org/17/1569/ 1 - https://github.com/snowballstem/snowball/pull/69 -- Arthur Zakirov Postgres Professional: http://www.postgrespro.com Russian Postgres Company
Re: [PROPOSAL] Nepali Snowball dictionary
On Tue, Feb 20, 2018 at 12:01 AM, Arthur Zakirov wrote: > Thank you for your answer! > > 2018-02-19 18:43 GMT+03:00 Tom Lane : >> We are not the upstream for the snowball stuff, and lack the expertise >> to decide whether proposed changes are any good. To get anything >> changed there, you'd have to get it approved by the snowball group. >> >> As best I know, the original list >> http://lists.tartarus.org/mailman/listinfo/snowball-discuss >> is moribund, but there's a fork at >> http://snowballstem.org >> that has at least some activity. > > From the original list it seems that http://snowballstem.org is > frozen. But development work continues at > https://github.com/snowballstem by other people. > I'll try to send them a pull request. > >> Probably the first step ought to involve syncing our copy with the >> current state of that upstream, something that's not been done in a >> very long time :-( > > I think I will try to sync snowball dictionaries with snowballstem.org > algorithms, it may be useful. Or maybe it is better to sync with the > github repository. I don't aware how they differ yet, though. That may be dangerous! > > -- > Arthur Zakirov > Postgres Professional: http://www.postgrespro.com > Russian Postgres Company >
Re: [PROPOSAL] Nepali Snowball dictionary
Thank you for your answer! 2018-02-19 18:43 GMT+03:00 Tom Lane : > We are not the upstream for the snowball stuff, and lack the expertise > to decide whether proposed changes are any good. To get anything > changed there, you'd have to get it approved by the snowball group. > > As best I know, the original list > http://lists.tartarus.org/mailman/listinfo/snowball-discuss > is moribund, but there's a fork at > http://snowballstem.org > that has at least some activity. From the original list it seems that http://snowballstem.org is frozen. But development work continues at https://github.com/snowballstem by other people. I'll try to send them a pull request. > Probably the first step ought to involve syncing our copy with the > current state of that upstream, something that's not been done in a > very long time :-( I think I will try to sync snowball dictionaries with snowballstem.org algorithms, it may be useful. Or maybe it is better to sync with the github repository. I'm not aware of how they differ yet, though. -- Arthur Zakirov Postgres Professional: http://www.postgrespro.com Russian Postgres Company
Re: [PROPOSAL] Nepali Snowball dictionary
Arthur Zakirov writes: > Is it appropriate to add new snowball dictionaries? I'm not sure about > policy of including new snowball dictionaries. We are not the upstream for the snowball stuff, and lack the expertise to decide whether proposed changes are any good. To get anything changed there, you'd have to get it approved by the snowball group. As best I know, the original list http://lists.tartarus.org/mailman/listinfo/snowball-discuss is moribund, but there's a fork at http://snowballstem.org that has at least some activity. Probably the first step ought to involve syncing our copy with the current state of that upstream, something that's not been done in a very long time :-( regards, tom lane
[PROPOSAL] Nepali Snowball dictionary
Hello hackers, I would like to propose nepali snowball dictionary patch. Nepali is inflectional and derivational language. And it can be stemmed. initdb also patched, so it can determine default text search configuration. Examples: =# select ts_lexize('nepali_stem', 'लेख्'); ts_lexize --- {लेख्} =# select ts_lexize('nepali_stem', 'लेखछेस्'); ts_lexize --- {लेख} =# select ts_lexize('nepali_stem', 'लेखे'); ts_lexize --- {लेखे} Authors: - Oleg Bartunov - Nepali NLP Group Is it appropriate to add new snowball dictionaries? I'm not sure about policy of including new snowball dictionaries. -- Arthur Zakirov Postgres Professional: http://www.postgrespro.com Russian Postgres Company diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 610b7bf033..c9c7de52ad 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -3723,6 +3723,7 @@ Parser: "pg_catalog.default" pg_catalog | german_stem | snowball stemmer for german language pg_catalog | hungarian_stem | snowball stemmer for hungarian language pg_catalog | italian_stem| snowball stemmer for italian language + pg_catalog | nepali_stem | snowball stemmer for nepali language pg_catalog | norwegian_stem | snowball stemmer for norwegian language pg_catalog | portuguese_stem | snowball stemmer for portuguese language pg_catalog | romanian_stem | snowball stemmer for romanian language diff --git a/src/backend/snowball/Makefile b/src/backend/snowball/Makefile index 50cbace41d..c29f4184f2 100644 --- a/src/backend/snowball/Makefile +++ b/src/backend/snowball/Makefile @@ -40,6 +40,7 @@ OBJS= $(WIN32RES) dict_snowball.o api.o utilities.o \ stem_UTF_8_german.o \ stem_UTF_8_hungarian.o \ stem_UTF_8_italian.o \ + stem_UTF_8_nepali.o \ stem_UTF_8_norwegian.o \ stem_UTF_8_porter.o \ stem_UTF_8_portuguese.o \ @@ -62,6 +63,7 @@ LANGUAGES= \ german german \ hungarian hungarian \ italian italian \ + nepali nepali \ norwegian norwegian \ portuguese portuguese \ romanianromanian\ diff --git 
a/src/backend/snowball/dict_snowball.c b/src/backend/snowball/dict_snowball.c index 78c9f73ef0..d96c849118 100644 --- a/src/backend/snowball/dict_snowball.c +++ b/src/backend/snowball/dict_snowball.c @@ -49,6 +49,7 @@ #include "snowball/libstemmer/stem_UTF_8_german.h" #include "snowball/libstemmer/stem_UTF_8_hungarian.h" #include "snowball/libstemmer/stem_UTF_8_italian.h" +#include "snowball/libstemmer/stem_UTF_8_nepali.h" #include "snowball/libstemmer/stem_UTF_8_norwegian.h" #include "snowball/libstemmer/stem_UTF_8_porter.h" #include "snowball/libstemmer/stem_UTF_8_portuguese.h" @@ -102,6 +103,7 @@ static const stemmer_module stemmer_modules[] = {"german", PG_UTF8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, {"hungarian", PG_UTF8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, {"italian", PG_UTF8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, + {"nepali", PG_UTF8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, nepali_UTF_8_stem}, {"norwegian", PG_UTF8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, {"porter", PG_UTF8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem}, {"portuguese", PG_UTF8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, diff --git a/src/backend/snowball/libstemmer/stem_UTF_8_nepali.c b/src/backend/snowball/libstemmer/stem_UTF_8_nepali.c new file mode 100644 index 00..f4f6e656ad --- /dev/null +++ b/src/backend/snowball/libstemmer/stem_UTF_8_nepali.c @@ -0,0 +1,440 @@ +/* This file was generated automatically by the Snowball to ISO C compiler */ +/* http://snowballstem.org/ */ + +#include "header.h" + +#ifdef __cplusplus +extern "C" { +#endif +extern int nepali_UTF_8_stem(struct SN_env * z); +#ifdef __cplusplus +} +#endif +static int r_remove_category_2(struct SN_env * z); +static int r_remove_category_3(struct SN_env * z); +static int r_check_category_2(struct SN_env * z); +static 
int r_remove_category_1(struct SN_env * z); +#ifdef __cplusplus +extern "C" { +#endif + + +extern struct SN_env * nepali_UTF_8_create_env(void); +extern void nepali_UTF_8_close_env(struct SN_env * z); + + +#ifdef __cplusplus +} +#endif +static const symbol s_0_0[6] = { 0xE0, 0xA4, 0x95, 0xE0, 0xA5, 0x80 }; +static const symbol s_0_1[9] = { 0xE0, 0xA4, 0xB2, 0xE0, 0xA4, 0xBE, 0xE0, 0xA4, 0x87 }; +static const symbol s_0_2[6] = { 0xE0, 0xA4, 0xB2, 0xE0, 0xA5, 0x87 }; +static const symbol s_0_3[9] = { 0xE0, 0xA4, 0xB2, 0xE0, 0xA4, 0xBE, 0xE0, 0xA4, 0x88 }; +static const symbol s_0_4[6] = { 0xE0, 0xA4, 0x95, 0xE0, 0xA5, 0x88 }; +static const sym