Re: [PROPOSAL] Nepali Snowball dictionary

2018-06-26 Thread Arthur Zakirov
Hello all,

On Wed, Feb 28, 2018 at 11:16:24AM +0300, Arthur Zakirov wrote:
> I've sent a pull request with nepali snowball algorithm into
> https://github.com/snowballstem [1]. They aren't againts the patch.
> 
> They haven't merged it yet, though. There are some problems with
> continuous testing via Travis CI which aren't related with the patch and
> require fix some scripts.

The patch with the Nepali stemmer has been merged into the Snowball upstream
[1].

I have attached the second version of the patch. The Nepali stemmer was
generated by the latest Snowball compiler (commit
3c3d3953e8174b55e86aedd89544ea4e5d76db78).

I will add a new commitfest entry.

Authors:
- Ingroj Shrestha, Nepali NLP Group
- Oleg Bartunov, Postgres Professional Ltd.
- Shreeya Singh Dhakal, Nepali NLP Group

1 - https://github.com/snowballstem/snowball/pull/70

-- 
Arthur Zakirov
Postgres Professional: http://www.postgrespro.com
Russian Postgres Company
diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml
index 8075ea94e7..61280b03fc 100644
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@@ -3803,6 +3803,7 @@ Parser: "pg_catalog.default"
  pg_catalog | german_stem | snowball stemmer for german language
  pg_catalog | hungarian_stem  | snowball stemmer for hungarian language
  pg_catalog | italian_stem| snowball stemmer for italian language
+ pg_catalog | nepali_stem | snowball stemmer for nepali language
  pg_catalog | norwegian_stem  | snowball stemmer for norwegian language
  pg_catalog | portuguese_stem | snowball stemmer for portuguese language
  pg_catalog | romanian_stem   | snowball stemmer for romanian language
diff --git a/src/backend/snowball/Makefile b/src/backend/snowball/Makefile
index 50cbace41d..c29f4184f2 100644
--- a/src/backend/snowball/Makefile
+++ b/src/backend/snowball/Makefile
@@ -40,6 +40,7 @@ OBJS= $(WIN32RES) dict_snowball.o api.o utilities.o \
stem_UTF_8_german.o \
stem_UTF_8_hungarian.o \
stem_UTF_8_italian.o \
+   stem_UTF_8_nepali.o \
stem_UTF_8_norwegian.o \
stem_UTF_8_porter.o \
stem_UTF_8_portuguese.o \
@@ -62,6 +63,7 @@ LANGUAGES=  \
german  german  \
hungarian   hungarian   \
italian italian \
+   nepali  nepali  \
norwegian   norwegian   \
portuguese  portuguese  \
romanianromanian\
diff --git a/src/backend/snowball/dict_snowball.c 
b/src/backend/snowball/dict_snowball.c
index 78c9f73ef0..d96c849118 100644
--- a/src/backend/snowball/dict_snowball.c
+++ b/src/backend/snowball/dict_snowball.c
@@ -49,6 +49,7 @@
 #include "snowball/libstemmer/stem_UTF_8_german.h"
 #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
 #include "snowball/libstemmer/stem_UTF_8_italian.h"
+#include "snowball/libstemmer/stem_UTF_8_nepali.h"
 #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
 #include "snowball/libstemmer/stem_UTF_8_porter.h"
 #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
@@ -102,6 +103,7 @@ static const stemmer_module stemmer_modules[] =
{"german", PG_UTF8, german_UTF_8_create_env, german_UTF_8_close_env, 
german_UTF_8_stem},
{"hungarian", PG_UTF8, hungarian_UTF_8_create_env, 
hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
{"italian", PG_UTF8, italian_UTF_8_create_env, italian_UTF_8_close_env, 
italian_UTF_8_stem},
+   {"nepali", PG_UTF8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, 
nepali_UTF_8_stem},
{"norwegian", PG_UTF8, norwegian_UTF_8_create_env, 
norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
{"porter", PG_UTF8, porter_UTF_8_create_env, porter_UTF_8_close_env, 
porter_UTF_8_stem},
{"portuguese", PG_UTF8, portuguese_UTF_8_create_env, 
portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
diff --git a/src/backend/snowball/libstemmer/stem_UTF_8_nepali.c 
b/src/backend/snowball/libstemmer/stem_UTF_8_nepali.c
new file mode 100644
index 00..d1c1be76f3
--- /dev/null
+++ b/src/backend/snowball/libstemmer/stem_UTF_8_nepali.c
@@ -0,0 +1,424 @@
+/* This file was generated automatically by the Snowball to ISO C compiler */
+/* http://snowballstem.org/ */
+
+#include "header.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern int nepali_UTF_8_stem(struct SN_env * z);
+#ifdef __cplusplus
+}
+#endif
+static int r_remove_category_3(struct SN_env * z);
+static int r_remove_category_2(struct SN_env * z);
+static int r_check_category_2(struct SN_env * z);
+static int r_remove_category_1(struct SN_env * z);
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+extern struct SN_env * nepali_UTF_8_create_env(void);
+extern void nepali_UTF_8_close_env(struct SN_env * z);
+
+
+#ifdef __cplusplus
+}
+#endif
+static const symbol s_0_0[6] = { 0xE0, 0xA4, 0x95, 0xE0, 0xA5, 0x80 };
+static const symbol s_0_1[9] = { 0xE0, 0xA4, 0xB2, 0xE0, 0xA4, 0xBE, 0xE0, 
0xA4, 0x87 };
+static const symbol s_0_2[6] = { 0xE0, 0xA4, 0xB2, 0xE0, 0x

Re: [PROPOSAL] Nepali Snowball dictionary

2018-03-02 Thread Arthur Zakirov
On Thu, Mar 01, 2018 at 10:23:11PM -0800, Andres Freund wrote:
> What is that entry for, if I may ask?  We need to wait for them to merge
> it, then sync the snowball code, including the nepali dictionary. This
> doesn't realistically seem doable for this commitfest. Therefore I think
> this should be marked as 'returned with feedback'.

I understand the point. I have marked the patch as 'Returned with feedback'
myself.

-- 
Arthur Zakirov
Postgres Professional: http://www.postgrespro.com
Russian Postgres Company



Re: [PROPOSAL] Nepali Snowball dictionary

2018-03-01 Thread Andres Freund
Hi,

On 2018-02-28 11:16:24 +0300, Arthur Zakirov wrote:
> I've created the commitfest entry https://commitfest.postgresql.org/17/1569/

What is that entry for, if I may ask?  We need to wait for them to merge
it, then sync the snowball code, including the nepali dictionary. This
doesn't realistically seem doable for this commitfest. Therefore I think
this should be marked as 'returned with feedback'.

Greetings,

Andres Freund



Re: [PROPOSAL] Nepali Snowball dictionary

2018-02-28 Thread Arthur Zakirov
On Tue, Feb 20, 2018 at 12:01:30AM +0300, Arthur Zakirov wrote:
> > As best I know, the original list
> > http://lists.tartarus.org/mailman/listinfo/snowball-discuss
> > is moribund, but there's a fork at
> > http://snowballstem.org
> > that has at least some activity.
> 
> From the original list it seems that http://snowballstem.org is
> frozen. But development work continues at
> https://github.com/snowballstem by other people.
> I'll try to send them a pull request.

I've sent a pull request with the Nepali Snowball algorithm to
https://github.com/snowballstem [1]. They aren't against the patch.

They haven't merged it yet, though. There are some problems with
continuous testing via Travis CI which aren't related to the patch and
require fixing some scripts.

I've created the commitfest entry https://commitfest.postgresql.org/17/1569/


1 - https://github.com/snowballstem/snowball/pull/69

-- 
Arthur Zakirov
Postgres Professional: http://www.postgrespro.com
Russian Postgres Company



Re: [PROPOSAL] Nepali Snowball dictionary

2018-02-19 Thread Oleg Bartunov
On Tue, Feb 20, 2018 at 12:01 AM, Arthur Zakirov
 wrote:
> Thank you for your answer!
>
> 2018-02-19 18:43 GMT+03:00 Tom Lane :
>> We are not the upstream for the snowball stuff, and lack the expertise
>> to decide whether proposed changes are any good.  To get anything
>> changed there, you'd have to get it approved by the snowball group.
>>
>> As best I know, the original list
>> http://lists.tartarus.org/mailman/listinfo/snowball-discuss
>> is moribund, but there's a fork at
>> http://snowballstem.org
>> that has at least some activity.
>
> From the original list it seems that http://snowballstem.org is
> frozen. But development work continues at
> https://github.com/snowballstem by other people.
> I'll try to send them a pull request.
>
>> Probably the first step ought to involve syncing our copy with the
>> current state of that upstream, something that's not been done in a
>> very long time :-(
>
> I think I will try to sync snowball dictionaries with snowballstem.org
> algorithms, it may be useful. Or maybe it is better to sync with the
> github repository. I don't aware how they differ yet, though.

That may be dangerous !

>
> --
> Arthur Zakirov
> Postgres Professional: http://www.postgrespro.com
> Russian Postgres Company
>



Re: [PROPOSAL] Nepali Snowball dictionary

2018-02-19 Thread Arthur Zakirov
Thank you for your answer!

2018-02-19 18:43 GMT+03:00 Tom Lane :
> We are not the upstream for the snowball stuff, and lack the expertise
> to decide whether proposed changes are any good.  To get anything
> changed there, you'd have to get it approved by the snowball group.
>
> As best I know, the original list
> http://lists.tartarus.org/mailman/listinfo/snowball-discuss
> is moribund, but there's a fork at
> http://snowballstem.org
> that has at least some activity.

From the original list it seems that http://snowballstem.org is
frozen. But development work continues at
https://github.com/snowballstem by other people.
I'll try to send them a pull request.

> Probably the first step ought to involve syncing our copy with the
> current state of that upstream, something that's not been done in a
> very long time :-(

I think I will try to sync the snowball dictionaries with the snowballstem.org
algorithms; it may be useful. Or maybe it is better to sync with the
GitHub repository. I am not aware of how they differ yet, though.

-- 
Arthur Zakirov
Postgres Professional: http://www.postgrespro.com
Russian Postgres Company



Re: [PROPOSAL] Nepali Snowball dictionary

2018-02-19 Thread Tom Lane
Arthur Zakirov  writes:
> Is it appropriate to add new snowball dictionaries? I'm not sure about
> policy of including new snowball dictionaries.

We are not the upstream for the snowball stuff, and lack the expertise
to decide whether proposed changes are any good.  To get anything
changed there, you'd have to get it approved by the snowball group.

As best I know, the original list
http://lists.tartarus.org/mailman/listinfo/snowball-discuss
is moribund, but there's a fork at
http://snowballstem.org
that has at least some activity.

Probably the first step ought to involve syncing our copy with the
current state of that upstream, something that's not been done in a
very long time :-(

regards, tom lane



[PROPOSAL] Nepali Snowball dictionary

2018-02-19 Thread Arthur Zakirov
Hello hackers,

I would like to propose a Nepali Snowball dictionary patch.

Nepali is an inflectional and derivational language, so it can be stemmed.

initdb is also patched, so that it can determine the default text search
configuration.

Examples:

=# select ts_lexize('nepali_stem', 'लेख्');
 ts_lexize 
---
 {लेख्}

=# select ts_lexize('nepali_stem', 'लेखछेस्');
 ts_lexize 
---
 {लेख}

=# select ts_lexize('nepali_stem', 'लेखे');
 ts_lexize 
---
 {लेखे}

Authors:
- Oleg Bartunov
- Nepali NLP Group

Is it appropriate to add new snowball dictionaries? I'm not sure about
the policy on including new snowball dictionaries.

-- 
Arthur Zakirov
Postgres Professional: http://www.postgrespro.com
Russian Postgres Company
diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml
index 610b7bf033..c9c7de52ad 100644
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@@ -3723,6 +3723,7 @@ Parser: "pg_catalog.default"
  pg_catalog | german_stem | snowball stemmer for german language
  pg_catalog | hungarian_stem  | snowball stemmer for hungarian language
  pg_catalog | italian_stem| snowball stemmer for italian language
+ pg_catalog | nepali_stem | snowball stemmer for nepali language
  pg_catalog | norwegian_stem  | snowball stemmer for norwegian language
  pg_catalog | portuguese_stem | snowball stemmer for portuguese language
  pg_catalog | romanian_stem   | snowball stemmer for romanian language
diff --git a/src/backend/snowball/Makefile b/src/backend/snowball/Makefile
index 50cbace41d..c29f4184f2 100644
--- a/src/backend/snowball/Makefile
+++ b/src/backend/snowball/Makefile
@@ -40,6 +40,7 @@ OBJS= $(WIN32RES) dict_snowball.o api.o utilities.o \
stem_UTF_8_german.o \
stem_UTF_8_hungarian.o \
stem_UTF_8_italian.o \
+   stem_UTF_8_nepali.o \
stem_UTF_8_norwegian.o \
stem_UTF_8_porter.o \
stem_UTF_8_portuguese.o \
@@ -62,6 +63,7 @@ LANGUAGES=  \
german  german  \
hungarian   hungarian   \
italian italian \
+   nepali  nepali  \
norwegian   norwegian   \
portuguese  portuguese  \
romanianromanian\
diff --git a/src/backend/snowball/dict_snowball.c 
b/src/backend/snowball/dict_snowball.c
index 78c9f73ef0..d96c849118 100644
--- a/src/backend/snowball/dict_snowball.c
+++ b/src/backend/snowball/dict_snowball.c
@@ -49,6 +49,7 @@
 #include "snowball/libstemmer/stem_UTF_8_german.h"
 #include "snowball/libstemmer/stem_UTF_8_hungarian.h"
 #include "snowball/libstemmer/stem_UTF_8_italian.h"
+#include "snowball/libstemmer/stem_UTF_8_nepali.h"
 #include "snowball/libstemmer/stem_UTF_8_norwegian.h"
 #include "snowball/libstemmer/stem_UTF_8_porter.h"
 #include "snowball/libstemmer/stem_UTF_8_portuguese.h"
@@ -102,6 +103,7 @@ static const stemmer_module stemmer_modules[] =
{"german", PG_UTF8, german_UTF_8_create_env, german_UTF_8_close_env, 
german_UTF_8_stem},
{"hungarian", PG_UTF8, hungarian_UTF_8_create_env, 
hungarian_UTF_8_close_env, hungarian_UTF_8_stem},
{"italian", PG_UTF8, italian_UTF_8_create_env, italian_UTF_8_close_env, 
italian_UTF_8_stem},
+   {"nepali", PG_UTF8, nepali_UTF_8_create_env, nepali_UTF_8_close_env, 
nepali_UTF_8_stem},
{"norwegian", PG_UTF8, norwegian_UTF_8_create_env, 
norwegian_UTF_8_close_env, norwegian_UTF_8_stem},
{"porter", PG_UTF8, porter_UTF_8_create_env, porter_UTF_8_close_env, 
porter_UTF_8_stem},
{"portuguese", PG_UTF8, portuguese_UTF_8_create_env, 
portuguese_UTF_8_close_env, portuguese_UTF_8_stem},
diff --git a/src/backend/snowball/libstemmer/stem_UTF_8_nepali.c 
b/src/backend/snowball/libstemmer/stem_UTF_8_nepali.c
new file mode 100644
index 00..f4f6e656ad
--- /dev/null
+++ b/src/backend/snowball/libstemmer/stem_UTF_8_nepali.c
@@ -0,0 +1,440 @@
+/* This file was generated automatically by the Snowball to ISO C compiler */
+/* http://snowballstem.org/ */
+
+#include "header.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+extern int nepali_UTF_8_stem(struct SN_env * z);
+#ifdef __cplusplus
+}
+#endif
+static int r_remove_category_2(struct SN_env * z);
+static int r_remove_category_3(struct SN_env * z);
+static int r_check_category_2(struct SN_env * z);
+static int r_remove_category_1(struct SN_env * z);
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+extern struct SN_env * nepali_UTF_8_create_env(void);
+extern void nepali_UTF_8_close_env(struct SN_env * z);
+
+
+#ifdef __cplusplus
+}
+#endif
+static const symbol s_0_0[6] = { 0xE0, 0xA4, 0x95, 0xE0, 0xA5, 0x80 };
+static const symbol s_0_1[9] = { 0xE0, 0xA4, 0xB2, 0xE0, 0xA4, 0xBE, 0xE0, 
0xA4, 0x87 };
+static const symbol s_0_2[6] = { 0xE0, 0xA4, 0xB2, 0xE0, 0xA5, 0x87 };
+static const symbol s_0_3[9] = { 0xE0, 0xA4, 0xB2, 0xE0, 0xA4, 0xBE, 0xE0, 
0xA4, 0x88 };
+static const symbol s_0_4[6] = { 0xE0, 0xA4, 0x95, 0xE0, 0xA5, 0x88 };
+static const sym