I've implemented the bare minimal needed to get regexes working through
a ⎕RE function. I've attached the diff.

I really need Jürgen to take a look at this, since my code that constructs
the return value cannot possibly be correct. There must be a better way to
handle this which does not involve conversion back and forth between
std::string.

Also, I have the result in an UTF-8-encoded C string, and I try to create
an UTF8_string from it like this:

    Value_P field_value(UTF8_string(field.c_str()), LOC);

However, when I test this in APL I get the following result:

      '(..)..(..)$' ⎕RE 'sdklfjfj⍉'
┏→━━━━━━━━━━┓
┃"lf" "jâ\215\211"┃
┗∊━━━━━━━━━━┛

It seems the UTF-8 conversion is not done correctly by the UTF8_string
constructor. What did I do wrong?

Regards,
Elias

On 21 September 2017 at 11:38, Xiao-Yong Jin <jinxiaoy...@gmail.com> wrote:

>
> > On Sep 20, 2017, at 9:19 PM, Peter Teeson <peter.tee...@icloud.com>
> wrote:
> >
> > (These days performance can hardly be a compelling argument
> > with multiple many-core CPU chips.)
>
> This kind of argument for APL is exactly why Fortran is still alive and
> well.
>
>
Index: configure.ac
===================================================================
--- configure.ac        (revision 1011)
+++ configure.ac        (working copy)
@@ -162,6 +162,8 @@
 fi
 fi
 
+m4_include([m4/ax_path_lib_pcre.m4]) AX_PATH_LIB_PCRE([])
+
 # check if rdtsc (read CPU cycle counter is available.
 # This is expected only on Intel CPUs
 AC_MSG_CHECKING([whether CPU has rdtsc (read CPU cycle counter) opcode])
Index: m4/ax_path_lib_pcre.m4
===================================================================
--- m4/ax_path_lib_pcre.m4      (nonexistent)
+++ m4/ax_path_lib_pcre.m4      (working copy)
@@ -0,0 +1,90 @@
+# ===========================================================================
+#     https://www.gnu.org/software/autoconf-archive/ax_path_lib_pcre.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_PATH_LIB_PCRE [(A/NA)]
+#
+# DESCRIPTION
+#
+#   check for pcre lib and set PCRE_LIBS and PCRE_CFLAGS accordingly.
+#
+#   also provide --with-pcre option that may point to the $prefix of the
+#   pcre installation - the macro will check $pcre/include and $pcre/lib to
+#   contain the necessary files.
+#
+#   the usual two ACTION-IF-FOUND / ACTION-IF-NOT-FOUND are supported and
+#   they can take advantage of the LIBS/CFLAGS additions.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Guido U. Draheim <gui...@gmx.de>
+#
+#   This program is free software; you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation; either version 3 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 8
+
+AC_DEFUN([AX_PATH_LIB_PCRE],[dnl
+AC_MSG_CHECKING([lib pcre])
+AC_ARG_WITH(pcre,
+[  --with-pcre[[=prefix]]    compile xmlpcre part (via libpcre check)],,
+     with_pcre="yes")
+if test ".$with_pcre" = ".no" ; then
+  AC_MSG_RESULT([disabled])
+  m4_ifval($2,$2)
+else
+  AC_MSG_RESULT([(testing)])
+  AC_CHECK_LIB(pcre, pcre_study)
+  if test "$ac_cv_lib_pcre_pcre_study" = "yes" ; then
+     PCRE_LIBS="-lpcre"
+     AC_MSG_CHECKING([lib pcre])
+     AC_MSG_RESULT([$PCRE_LIBS])
+     m4_ifval($1,$1)
+  else
+     OLDLDFLAGS="$LDFLAGS" ; LDFLAGS="$LDFLAGS -L$with_pcre/lib"
+     OLDCPPFLAGS="$CPPFLAGS" ; CPPFLAGS="$CPPFLAGS -I$with_pcre/include"
+     AC_CHECK_LIB(pcre, pcre_compile)
+     CPPFLAGS="$OLDCPPFLAGS"
+     LDFLAGS="$OLDLDFLAGS"
+     if test "$ac_cv_lib_pcre_pcre_compile" = "yes" ; then
+        AC_MSG_RESULT(.setting PCRE_LIBS -L$with_pcre/lib -lpcre)
+        PCRE_LIBS="-L$with_pcre/lib -lpcre"
+        test -d "$with_pcre/include" && PCRE_CFLAGS="-I$with_pcre/include"
+        AC_MSG_CHECKING([lib pcre])
+        AC_MSG_RESULT([$PCRE_LIBS])
+        m4_ifval($1,$1)
+     else
+        AC_MSG_CHECKING([lib pcre])
+        AC_MSG_RESULT([no, (WARNING)])
+        m4_ifval($2,$2)
+     fi
+  fi
+fi
+AC_SUBST([PCRE_LIBS])
+AC_SUBST([PCRE_CFLAGS])
+])
Index: src/Id.cc
===================================================================
--- src/Id.cc   (revision 1011)
+++ src/Id.cc   (working copy)
@@ -37,6 +37,7 @@
 #include "QuadFunction.hh"
 #include "Quad_DLX.hh"
 #include "Quad_FX.hh"
+#include "Quad_RE.hh"
 #include "Quad_SQL.hh"
 #include "Quad_SVx.hh"
 #include "Quad_TF.hh"
Index: src/Id.def
===================================================================
--- src/Id.def  (revision 1011)
+++ src/Id.def  (working copy)
@@ -201,6 +201,7 @@
 qf( SVS           , "竡百VS"    ,          )
 qv( SYL           , "竡百YL"    ,          )
 pp( USER_SYMBOL   , ---       ,          )
+qf( RE            , "竡紐E"     ,          )
 pp( STOP_LINE     , ---       ,          )
 qf( STOP          , "竡百TOP"   ,          )
 qf( SQL           , "竡百QL"    ,          )
Index: src/Makefile.am
===================================================================
--- src/Makefile.am     (revision 1011)
+++ src/Makefile.am     (working copy)
@@ -86,6 +86,7 @@
 Quad_DLX.cc                                    Quad_DLX.hh             \
 Quad_FIO.cc                                    Quad_FIO.hh             \
 Quad_FX.cc                                     Quad_FX.hh              \
+Quad_RE.cc                                      Quad_RE.hh              \
 Quad_RL.cc                                     Quad_RL.hh              \
 Quad_SQL.cc                                    Quad_SQL.hh             \
 Quad_SVx.cc                                    Quad_SVx.hh             \
Index: src/QuadFunction.cc
===================================================================
--- src/QuadFunction.cc (revision 1011)
+++ src/QuadFunction.cc (working copy)
@@ -36,6 +36,7 @@
 #include "PrintOperator.hh"
 #include "QuadFunction.hh"
 #include "Quad_FX.hh"
+#include "Quad_RE.hh"
 #include "Quad_SQL.hh"
 #include "Quad_TF.hh"
 #include "Tokenizer.hh"
Index: src/Quad_RE.cc
===================================================================
--- src/Quad_RE.cc      (nonexistent)
+++ src/Quad_RE.cc      (working copy)
@@ -0,0 +1,99 @@
+#include "Quad_RE.hh"
+#include "Workspace.hh"
+#include "PointerCell.hh"
+
+#include <pcre.h>
+
+Quad_RE Quad_RE::_fun;
+Quad_RE *Quad_RE::fun = &Quad_RE::_fun;
+
+Quad_RE::Quad_RE() : QuadFunction(TOK_Quad_RE)
+{
+}
+
+Token Quad_RE::eval_AB(Value_P A, Value_P B)
+{
+    if(!A->is_char_string()) {
+        MORE_ERROR() << "Regexp argument must be a string value";
+        VALUE_ERROR;
+    }
+
+    UTF8_string pattern(A->get_UCS_ravel());
+
+    const char *error_string;
+    int error_offset;
+
+    pcre *code = pcre_compile(pattern.c_str(), PCRE_UTF8, &error_string, 
&error_offset, NULL);
+    if(code == NULL) {
+        MORE_ERROR() << "Error compiling regex at offset: " << error_offset << 
": " << error_string;
+        SYNTAX_ERROR;
+    }
+
+    pcre_extra *extra = pcre_study(code, 0, &error_string);
+    if(error_string != NULL) {
+        MORE_ERROR() << "Error studying regex: " << error_string << "\n";
+        SYNTAX_ERROR;
+    }
+
+    int sub[256];
+
+    const Shape &shape = B->get_shape();
+    if(shape.get_rank() == 0) {
+        return Token(TOK_APL_VALUE1, Str0(LOC));
+    }
+    else if(B->is_char_string()) {
+        UTF8_string matched = B->get_UCS_ravel();
+        const char *matched_c = matched.c_str();
+        cout << "will match: " << matched << " against " << pattern << endl;
+        int match_result = pcre_exec(code, extra, matched_c, 
strlen(matched_c), 0, 0, sub, sizeof(sub) / sizeof(sub[0]));
+        cout << "n = " << match_result << endl;
+        if(match_result < 0) {
+            return Token(TOK_APL_VALUE1, Str0(LOC));
+        }
+
+        if(match_result == 1) {
+            // No subexpressions, return the entire matched string
+            string mc = matched_c;
+            UTF8_string result(mc.substr(sub[0], sub[1] - sub[0]).c_str());
+            Value_P result_value(result, LOC);
+            result_value->check_value(LOC);
+            return Token(TOK_APL_VALUE1, result_value);
+        }
+        else {
+            string mc = matched_c;
+            Shape shape(match_result - 1);
+            Value_P result_value(shape, LOC);
+            for(int i = 1 ; i < match_result ; i++) {
+                int start = sub[i * 2];
+                int end = sub[i * 2 + 1];
+                string field = mc.substr(start, end - start);
+                Value_P field_value(UTF8_string(field.c_str()), LOC);
+                field_value->check_value(LOC);
+                new (result_value->next_ravel()) PointerCell(field_value, 
result_value.getref());
+            }
+            result_value->check_value(LOC);
+            return Token(TOK_APL_VALUE1, result_value);
+        }
+    }
+    else {
+        VALUE_ERROR;
+    }
+}
+
+Token
+Quad_RE::eval_AXB(const Value_P A, const Value_P X, const Value_P B)
+{
+    return Token( TOK_APL_VALUE1, Str0( LOC ) );
+}
+
+Token
+Quad_RE::eval_B(Value_P B)
+{
+    return Token( TOK_APL_VALUE1, Str0( LOC ) );
+}
+
+Token
+Quad_RE::eval_XB(Value_P X, Value_P B)
+{
+    return Token( TOK_APL_VALUE1, Str0( LOC ) );
+}
Index: src/Quad_RE.hh
===================================================================
--- src/Quad_RE.hh      (nonexistent)
+++ src/Quad_RE.hh      (working copy)
@@ -0,0 +1,54 @@
+/*
+    This file is part of GNU APL, a free implementation of the
+    ISO/IEC Standard 13751, "Programming Language APL, Extended"
+
+    Copyright (C) 2008-2016  Dr. Jテシrgen Sauermann
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __Quad_SQL_RE_DEFINED__
+#define __Quad_SQL_RE_DEFINED__
+
+#include "QuadFunction.hh"
+#include "Value.hh"
+#include "Simple_string.hh"
+
+class Quad_RE : public QuadFunction
+{
+public:
+   /// Constructor.
+   Quad_RE();
+
+   static Quad_RE * fun;          ///< Built-in function.
+   static Quad_RE  _fun;          ///< Built-in function.
+
+protected:
+   /// overloaded Function::eval_AB().
+   Token eval_AB(const Value_P A, const Value_P B);
+
+   /// overloaded Function::eval_AXB().
+   Token eval_AXB(const Value_P A, const Value_P X, const Value_P B);
+
+   /// overloaded Function::eval_B().
+   Token eval_B(Value_P B);
+
+   /// overloaded Function::eval_XB().
+   Token eval_XB(Value_P X, Value_P B);
+
+// virtual Token eval_AB(Value_P A, Value_P B);
+
+};
+
+#endif
Index: src/SystemVariable.def
===================================================================
--- src/SystemVariable.def      (revision 1011)
+++ src/SystemVariable.def      (working copy)
@@ -73,6 +73,7 @@
   sf_def(Quad_NA,    "NA",    "Name Association"             )
   sf_def(Quad_NC,    "NC",    "Name Class"                   )
   sf_def(Quad_NL,    "NL",    "Name List"                    )
+  sf_def(Quad_RE,    "RE",    "Regular expression"           )
   sf_def(Quad_SI,    "SI",    "State Indicator"              )
   sf_def(Quad_SQL,   "SQL",   "SQL functions"                )
   sf_def(Quad_SVC,   "SVC",   "Shared Variable Control"      )
@@ -86,6 +87,3 @@
   sf_def(Quad_UCS,   "UCS",   "Universal Char Set (Unicode)" )
 # undef sf_def
 #endif
-
-
-
Index: src/Token.def
===================================================================
--- src/Token.def       (revision 1011)
+++ src/Token.def       (working copy)
@@ -116,6 +116,7 @@
 TD(TOK_Quad_EC       , TC_FUN1      , TV_FUN  , ID::Quad_EC      )
 TD(TOK_Quad_ENV      , TC_FUN1      , TV_FUN  , ID::Quad_ENV     )
 TD(TOK_Quad_EX       , TC_FUN1      , TV_FUN  , ID::Quad_EX      )
+TD(TOK_Quad_RE       , TC_FUN2      , TV_FUN  , ID::Quad_RE      )
 TD(TOK_Quad_SQL      , TC_FUN2      , TV_FUN  , ID::Quad_SQL     )
 TD(TOK_Quad_SVQ      , TC_FUN1      , TV_FUN  , ID::Quad_SVQ     )
 TD(TOK_Quad_SVR      , TC_FUN1      , TV_FUN  , ID::Quad_SVR     )
Index: src/Workspace.hh
===================================================================
--- src/Workspace.hh    (revision 1011)
+++ src/Workspace.hh    (working copy)
@@ -28,6 +28,7 @@
 #include "Quad_CR.hh"
 #include "Quad_DLX.hh"
 #include "Quad_FIO.hh"
+#include "Quad_RE.hh"
 #include "Quad_RL.hh"
 #include "Quad_SVx.hh"
 #include "ScalarFunction.hh"

Reply via email to