Script 'mail_helper' called by obssrc Hello community, here is the log from the commit of package ocaml-re for openSUSE:Factory checked in at 2023-09-15 22:05:31 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/ocaml-re (Old) and /work/SRC/openSUSE:Factory/.ocaml-re.new.1766 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "ocaml-re" Fri Sep 15 22:05:31 2023 rev:8 rq:1111517 version:1.11.0 Changes: -------- --- /work/SRC/openSUSE:Factory/ocaml-re/ocaml-re.changes 2022-05-05 23:08:01.553679572 +0200 +++ /work/SRC/openSUSE:Factory/.ocaml-re.new.1766/ocaml-re.changes 2023-09-15 22:10:57.348892739 +0200 @@ -1,0 +2,6 @@ +Sat Sep 9 09:09:09 UTC 2023 - oher...@suse.de + +- Update to version 1.11.0 + see included CHANGES.md for details + +------------------------------------------------------------------- Old: ---- ocaml-re-1.10.4.tar.xz New: ---- ocaml-re-1.11.0.tar.xz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ ocaml-re.spec ++++++ --- /var/tmp/diff_new_pack.GBZJBi/_old 2023-09-15 22:10:59.484969113 +0200 +++ /var/tmp/diff_new_pack.GBZJBi/_new 2023-09-15 22:10:59.496969542 +0200 @@ -1,7 +1,7 @@ # # spec file for package ocaml-re # -# Copyright (c) 2022 SUSE LLC +# Copyright (c) 2023 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -25,7 +25,7 @@ %define pkg ocaml-re Name: %pkg%nsuffix -Version: 1.10.4 +Version: 1.11.0 Release: 0 %{?ocaml_preserve_bytecode} Summary: Pure OCaml regular expressions @@ -35,7 +35,7 @@ Source0: %pkg-%version.tar.xz BuildRequires: ocaml BuildRequires: ocaml-dune >= 2.0 -BuildRequires: ocaml-rpm-macros >= 20220409 +BuildRequires: ocaml-rpm-macros >= 20230101 %if 1 BuildRequires: ocamlfind(seq) BuildRequires: ocamlfind(str) ++++++ _service ++++++ --- /var/tmp/diff_new_pack.GBZJBi/_old 2023-09-15 22:10:59.896983845 +0200 +++ /var/tmp/diff_new_pack.GBZJBi/_new 2023-09-15 22:10:59.940985418 +0200 @@ -1,7 +1,7 @@ <services> <service name="tar_scm" mode="disabled"> <param name="filename">ocaml-re</param> - <param name="revision">e9a4cecb8294c1839db18b1d0c30e755ec85ed5e</param> + <param name="revision">2dd38515c76c40299596d39f18d9b9a20f00d788</param> <param name="scm">git</param> <param name="submodules">disable</param> <param name="url">https://github.com/ocaml/ocaml-re.git</param> ++++++ ocaml-re-1.10.4.tar.xz -> ocaml-re-1.11.0.tar.xz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/.github/workflows/gh-pages.yml new/ocaml-re-1.11.0/.github/workflows/gh-pages.yml --- old/ocaml-re-1.10.4/.github/workflows/gh-pages.yml 1970-01-01 01:00:00.000000000 +0100 +++ new/ocaml-re-1.11.0/.github/workflows/gh-pages.yml 2023-08-19 12:51:09.000000000 +0200 @@ -0,0 +1,39 @@ +name: github pages + +on: + push: + branches: + - master + +jobs: + deploy: + name: Deploy doc + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@main + + - name: Use OCaml + uses: ocaml/setup-ocaml@v2 + with: + ocaml-compiler: '4.14.x' + + - name: Pin + run: opam pin -n . + + - name: Depext + run: opam depext -yt re + + - name: Deps + run: opam install -d . --deps-only + run: opam install odoc + + - name: Build + run: opam exec -- dune build @doc + + - name: Deploy + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./_build/default/_doc/_html/ + destination_dir: . + enable_jekyll: true diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/.github/workflows/main.yml new/ocaml-re-1.11.0/.github/workflows/main.yml --- old/ocaml-re-1.10.4/.github/workflows/main.yml 1970-01-01 01:00:00.000000000 +0100 +++ new/ocaml-re-1.11.0/.github/workflows/main.yml 2023-08-19 12:51:09.000000000 +0200 @@ -0,0 +1,32 @@ +name: build +on: + push: + branches: + - master + pull_request: + branches: + - master +jobs: + run: + name: Build + strategy: + matrix: + os: + - ubuntu-latest + #- macos-latest + #- windows-latest + ocaml-compiler: + - 4.08.x + - 4.14.x + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v2 + - uses: ocaml/setup-ocaml@v2 + with: + ocaml-compiler: ${{ matrix.ocaml-compiler }} + - run: opam pin -n . + - run: opam depext -yt re + - run: opam install -t . --deps-only + - run: opam install -y core_bench core_unix + - run: opam exec -- dune build + - run: opam exec -- dune runtest diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/.vscode/settings.json new/ocaml-re-1.11.0/.vscode/settings.json --- old/ocaml-re-1.10.4/.vscode/settings.json 2022-04-27 20:13:14.000000000 +0200 +++ new/ocaml-re-1.11.0/.vscode/settings.json 2023-08-19 12:51:09.000000000 +0200 @@ -1,6 +1,6 @@ { "ocaml.sandbox": { "kind": "opam", - "switch": "4.12.0" + "switch": "4.14.1" } -} \ No newline at end of file +} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/CHANGES.md new/ocaml-re-1.11.0/CHANGES.md --- old/ocaml-re-1.10.4/CHANGES.md 2022-04-27 20:13:14.000000000 +0200 +++ new/ocaml-re-1.11.0/CHANGES.md 2023-08-19 12:51:09.000000000 +0200 @@ -1,3 +1,15 @@ +1.11.0 (19-Aug-2023) +-------------------- + +* Add `Re.group_count` to get the number of groups in a compiled regex (#218) +* Add `Re.exec_partial_detailed` to allow resuming searches from partial inputs + (#219) +* Re-export `Re.Perl`'s `Parse_error` and `Not_supported` exceptions + in Pcre (#222) +* Add support for `DOTALL` flag in `Re.Pcre.regexp` (#225) +* Add support for named groups (#223) +* Add support for some control characters in `Re.Perl` (#227) + 1.10.4 (27-Apr-2022) -------------------- diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/README.md new/ocaml-re-1.11.0/README.md --- old/ocaml-re-1.10.4/README.md 2022-04-27 20:13:14.000000000 +0200 +++ new/ocaml-re-1.11.0/README.md 2023-08-19 12:51:09.000000000 +0200 @@ -2,7 +2,7 @@ =========== Re is a regular expression library for OCaml. -[](https://travis-ci.org/ocaml/ocaml-re) +[](https://github.com/ocaml/ocaml-re/actions/workflows/main.yml) Contact ======= @@ -29,7 +29,7 @@ look-ahead/look-behind **assertions**. There is also a subset of the PCRE interface available in the `Re.Pcre` module. -This makes it easier to port code from that library to Re minimal changes. +This makes it easier to port code from that library to Re with minimal changes. Performances ============ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/benchmarks/benchmark.ml new/ocaml-re-1.11.0/benchmarks/benchmark.ml --- old/ocaml-re-1.10.4/benchmarks/benchmark.ml 2022-04-27 20:13:14.000000000 +0200 +++ new/ocaml-re-1.11.0/benchmarks/benchmark.ml 2023-08-19 12:51:09.000000000 +0200 @@ -163,4 +163,4 @@ |> Bench.Test.create_group ~name:"tex gitignore" ] @ [http_benches] -let () = Command.run (Bench.make_command benchmarks) +let () = Command_unix.run (Bench.make_command benchmarks) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/benchmarks/dune new/ocaml-re-1.11.0/benchmarks/dune --- old/ocaml-re-1.10.4/benchmarks/dune 2022-04-27 20:13:14.000000000 +0200 +++ new/ocaml-re-1.11.0/benchmarks/dune 2023-08-19 12:51:09.000000000 +0200 @@ -1,3 +1,3 @@ (executable - (libraries re threads core_bench) + (libraries re threads core_bench core_unix.command_unix) (name benchmark)) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/lib/color_map.ml new/ocaml-re-1.11.0/lib/color_map.ml --- old/ocaml-re-1.10.4/lib/color_map.ml 2022-04-27 20:13:14.000000000 +0200 +++ new/ocaml-re-1.11.0/lib/color_map.ml 2023-08-19 12:51:09.000000000 +0200 @@ -24,7 +24,7 @@ Bytes.set c i (Char.chr !v); Bytes.set color_repr !v (Char.chr i) done; - (c, Bytes.sub color_repr 0 (!v + 1), !v + 1) + (Bytes.unsafe_to_string c, Bytes.sub_string color_repr 0 (!v + 1), !v + 1) (* mark all the endpoints of the intervals of the char set with the 1 byte *) let split s cm = diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/lib/color_map.mli new/ocaml-re-1.11.0/lib/color_map.mli --- old/ocaml-re-1.10.4/lib/color_map.mli 2022-04-27 20:13:14.000000000 +0200 +++ new/ocaml-re-1.11.0/lib/color_map.mli 2023-08-19 12:51:09.000000000 +0200 @@ -9,6 +9,6 @@ val make : unit -> t -val flatten : t -> bytes * bytes * int +val flatten : t -> string * string * int val split : Cset.t -> t -> unit diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/lib/core.ml new/ocaml-re-1.11.0/lib/core.ml --- old/ocaml-re-1.10.4/lib/core.ml 2022-04-27 20:13:14.000000000 +0200 +++ new/ocaml-re-1.11.0/lib/core.ml 2023-08-19 12:51:09.000000000 +0200 @@ -30,7 +30,7 @@ type match_info = | Match of Group.t | Failed - | Running + | Running of { no_match_starts_before : int } type state = { idx : int; @@ -59,9 +59,9 @@ (* The whole regular expression *) mutable initial_states : (Category.t * state) list; (* Initial states, indexed by initial category *) - colors : Bytes.t; + colors : string; (* Color table *) - color_repr : Bytes.t; + color_repr : string; (* Table from colors to one character of this color *) ncolor : int; (* Number of colors. *) @@ -72,6 +72,8 @@ when computing a new state *) states : state Automata.State.Table.t; (* States of the deterministic automata *) + group_names : (string * int) list; + (* Named groups in the regular expression *) group_count : int (* Number of groups in the regular expression *) } @@ -79,11 +81,15 @@ let print_re = pp_re +let group_count re = re.group_count + +let group_names re = re.group_names + (* Information used during matching *) type info = { re : re; (* The automata *) - colors : Bytes.t; + colors : string; (* Color table ([x.colors = x.re.colors]) Shortcut used for performance reasons *) mutable positions : int array; @@ -104,7 +110,7 @@ else if color = re.lnl then Category.(lastnewline ++ newline ++ not_letter) else - Category.from_char (Bytes.get re.color_repr color) + Category.from_char (re.color_repr.[color]) (****) @@ -149,64 +155,32 @@ desc let validate info (s:string) ~pos st = - let color = Char.code (Bytes.get info.colors (Char.code s.[pos])) in + let color = Char.code (info.colors.[Char.code s.[pos]]) in let cat = category info.re ~color in let desc' = delta info cat ~color st in let st' = find_state info.re desc' in st.next.(color) <- st' -(* -let rec loop info s pos st = +let rec loop info s ~pos st = if pos < info.last then - let st' = st.next.(Char.code info.cols.[Char.code s.[pos]]) in + let st' = st.next.(Char.code info.colors.[Char.code s.[pos]]) in let idx = st'.idx in if idx >= 0 then begin info.positions.(idx) <- pos; - loop info s (pos + 1) st' + loop info s ~pos:(pos + 1) st' end else if idx = break then begin info.positions.(st'.real_idx) <- pos; st' end else begin (* Unknown *) - validate info s pos st; - loop info s pos st + validate info s ~pos st; + loop info s ~pos st end else st -*) - -let rec loop info (s:string) ~pos st = - if pos < info.last then - let st' = st.next.(Char.code (Bytes.get info.colors (Char.code s.[pos]))) in - loop2 info s ~pos st st' - else - st - -and loop2 info s ~pos st st' = - if st'.idx >= 0 then begin - let pos = pos + 1 in - if pos < info.last then begin - (* It is important to place these reads before the write *) - (* But then, we don't have enough registers left to store the - right position. So, we store the position plus one. *) - let st'' = - st'.next.(Char.code (Bytes.get info.colors (Char.code s.[pos]))) in - info.positions.(st'.idx) <- pos; - loop2 info s ~pos st' st'' - end else begin - info.positions.(st'.idx) <- pos; - st' - end - end else if st'.idx = break then begin - info.positions.(st'.real_idx) <- pos + 1; - st' - end else begin (* Unknown *) - validate info s ~pos st; - loop info s ~pos st - end let rec loop_no_mark info s ~pos ~last st = if pos < last then - let st' = st.next.(Char.code (Bytes.get info.colors (Char.code s.[pos]))) in + let st' = st.next.(Char.code info.colors.[Char.code s.[pos]]) in if st'.idx >= 0 then loop_no_mark info s ~pos:(pos + 1) ~last st' else if st'.idx = break then @@ -246,19 +220,19 @@ (* Special case for the last newline *) re.lnl else - Char.code (Bytes.get re.colors (Char.code s.[pos])) + Char.code re.colors.[Char.code s.[pos]] let rec handle_last_newline info ~pos st ~groups = let st' = st.next.(info.re.lnl) in if st'.idx >= 0 then begin - if groups then info.positions.(st'.idx) <- pos + 1; + if groups then info.positions.(st'.idx) <- pos; st' end else if st'.idx = break then begin - if groups then info.positions.(st'.real_idx) <- pos + 1; + if groups then info.positions.(st'.real_idx) <- pos; st' end else begin (* Unknown *) let color = info.re.lnl in - let real_c = Char.code (Bytes.get info.colors (Char.code '\n')) in + let real_c = Char.code info.colors.[Char.code '\n'] in let cat = category info.re ~color in let desc' = delta info cat ~color:real_c st in let st' = find_state info.re desc' in @@ -285,6 +259,25 @@ else loop_no_mark info s ~pos ~last initial_state +(* This function adds a final boundary check on the input. + This is useful to indicate that the output failed because + of insufficient input, or to verify that the output actually + matches for regex that have boundary conditions with respect + to the input string. + *) +let final_boundary_check ~last ~slen re s ~info ~st ~groups = + let final_cat = + if last = slen then + Category.(search_boundary ++ inexistant) + else + Category.(search_boundary ++ category re ~color:(get_color re s last)) + in + let (idx, res) = final info st final_cat in + (match groups, res with + | true, Match _ -> info.positions.(idx) <- last + | _ -> ()); + res + let match_str ~groups ~partial re s ~pos ~len = let slen = String.length s in let last = if len = -1 then slen else pos + len in @@ -310,26 +303,32 @@ let initial_state = find_initial_state re initial_cat in let st = scan_str info s initial_state ~groups in let res = - if st.idx = break || partial then + if st.idx = break || (partial && not groups) then Automata.status st.desc - else - let final_cat = - if last = slen then - Category.(search_boundary ++ inexistant) - else - Category.(search_boundary ++ category re ~color:(get_color re s last)) - in - let (idx, res) = final info st final_cat in - if groups then info.positions.(idx) <- last + 1; - res + else if partial && groups then + match Automata.status st.desc with + | Match _ | Failed as status -> status + | Running -> + (* This could be because it's still not fully matched, or it + could be that because we need to run special end of input + checks. *) + (match final_boundary_check ~last ~slen re s ~info ~st ~groups with + | Match _ as status -> status + | Failed | Running -> + (* A failure here just means that we need more data, i.e. + it's a partial match. *) + Running) + else final_boundary_check ~last ~slen re s ~info ~st ~groups in match res with Automata.Match (marks, pmarks) -> Match { s ; marks; pmarks ; gpos = info.positions; gcount = re.group_count} | Automata.Failed -> Failed - | Automata.Running -> Running + | Automata.Running -> + let no_match_starts_before = if groups then info.positions.(0) else 0 in + Running { no_match_starts_before } -let mk_re ~initial ~colors ~color_repr ~ncolor ~lnl ~group_count = +let mk_re ~initial ~colors ~color_repr ~ncolor ~lnl ~group_names ~group_count = { initial ; initial_states = []; colors; @@ -338,6 +337,7 @@ lnl; tbl = Automata.create_working_area (); states = Automata.State.Table.create 97; + group_names; group_count } (**** Character sets ****) @@ -347,7 +347,7 @@ let trans_set cache cm s = match Cset.one_char s with - | Some i -> Cset.csingle (Bytes.get cm i) + | Some i -> Cset.csingle cm.[i] | None -> let v = (Cset.hash_rec s, s) in try @@ -356,8 +356,7 @@ let l = Cset.fold_right s - ~f:(fun (i, j) l -> Cset.union (cseq (Bytes.get cm i) - (Bytes.get cm j)) l) + ~f:(fun (i, j) l -> Cset.union (cseq cm.[i] cm.[j]) l) ~init:Cset.empty in cache := Cset.CSetMap.add v l !cache; @@ -376,7 +375,7 @@ | Last_end_of_line | Start | Stop | Sem of Automata.sem * regexp | Sem_greedy of Automata.rep_kind * regexp - | Group of regexp | No_group of regexp | Nest of regexp + | Group of string option * regexp | No_group of regexp | Nest of regexp | Case of regexp | No_case of regexp | Intersection of regexp list | Complement of regexp list @@ -395,7 +394,7 @@ | Last_end_of_line | Start | Stop | Sem of Automata.sem * regexp | Sem_greedy of Automata.rep_kind * regexp - | Group of regexp | No_group of regexp | Nest of regexp + | Group of string option * regexp | No_group of regexp | Nest of regexp | Case of regexp | No_case of regexp | Intersection of regexp list | Complement of regexp list @@ -430,7 +429,8 @@ sexp fmt "Sem" (pair Automata.pp_sem pp) (sem, re) | Sem_greedy (k, re) -> sexp fmt "Sem_greedy" (pair Automata.pp_rep_kind pp) (k, re) - | Group c -> var "Group" c + | Group (None, c) -> var "Group" c + | Group (Some n, c) -> sexp fmt "Named_group" (pair str pp) (n, c) | No_group c -> var "No_group" c | Nest c -> var "Nest" c | Case c -> var "Case" c @@ -485,7 +485,7 @@ | Last_end_of_line -> lnl := true | Sem (_, r) | Sem_greedy (_, r) - | Group r | No_group r + | Group (_, r) | No_group r | Nest r | Pmark (_,r) -> colorize r | Case _ | No_case _ | Intersection _ @@ -580,16 +580,16 @@ | _ -> cr (* XXX should probably compute a category mask *) -let rec translate ids kind ign_group ign_case greedy pos cache c = function +let rec translate ids kind ign_group ign_case greedy pos names cache c = function | Set s -> (A.cst ids (trans_set cache c s), kind) | Sequence l -> - (trans_seq ids kind ign_group ign_case greedy pos cache c l, kind) + (trans_seq ids kind ign_group ign_case greedy pos names cache c l, kind) | Alternative l -> begin match merge_sequences l with [r'] -> let (cr, kind') = - translate ids kind ign_group ign_case greedy pos cache c r' in + translate ids kind ign_group ign_case greedy pos names cache c r' in (enforce_kind ids kind kind' cr, kind) | merged_sequences -> (A.alt ids @@ -597,14 +597,14 @@ (fun r' -> let (cr, kind') = translate ids kind ign_group ign_case greedy - pos cache c r' in + pos names cache c r' in enforce_kind ids kind kind' cr) merged_sequences), kind) end | Repeat (r', i, j) -> let (cr, kind') = - translate ids kind ign_group ign_case greedy pos cache c r' in + translate ids kind ign_group ign_case greedy pos names cache c r' in let rem = match j with None -> @@ -658,28 +658,33 @@ (A.before ids Category.search_boundary, kind) | Sem (kind', r') -> let (cr, kind'') = - translate ids kind' ign_group ign_case greedy pos cache c r' in + translate ids kind' ign_group ign_case greedy pos names cache c r' in (enforce_kind ids kind' kind'' cr, kind') | Sem_greedy (greedy', r') -> - translate ids kind ign_group ign_case greedy' pos cache c r' - | Group r' -> + translate ids kind ign_group ign_case greedy' pos names cache c r' + | Group (n, r') -> if ign_group then - translate ids kind ign_group ign_case greedy pos cache c r' + translate ids kind ign_group ign_case greedy pos names cache c r' else let p = !pos in + let () = + match n with + | Some name -> names := (name, p / 2) :: !names + | None -> () + in pos := !pos + 2; let (cr, kind') = - translate ids kind ign_group ign_case greedy pos cache c r' in + translate ids kind ign_group ign_case greedy pos names cache c r' in (A.seq ids `First (A.mark ids p) ( A.seq ids `First cr (A.mark ids (p + 1))), kind') | No_group r' -> - translate ids kind true ign_case greedy pos cache c r' + translate ids kind true ign_case greedy pos names cache c r' | Nest r' -> let b = !pos in let (cr, kind') = - translate ids kind ign_group ign_case greedy pos cache c r' + translate ids kind ign_group ign_case greedy pos names cache c r' in let e = !pos - 1 in if e < b then @@ -690,21 +695,21 @@ assert false | Pmark (i, r') -> let (cr, kind') = - translate ids kind ign_group ign_case greedy pos cache c r' in + translate ids kind ign_group ign_case greedy pos names cache c r' in (A.seq ids `First (A.pmark ids i) cr, kind') -and trans_seq ids kind ign_group ign_case greedy pos cache c = function +and trans_seq ids kind ign_group ign_case greedy pos names cache c = function | [] -> A.eps ids | [r] -> let (cr', kind') = - translate ids kind ign_group ign_case greedy pos cache c r in + translate ids kind ign_group ign_case greedy pos names cache c r in enforce_kind ids kind kind' cr' | r :: rem -> let (cr', kind') = - translate ids kind ign_group ign_case greedy pos cache c r in + translate ids kind ign_group ign_case greedy pos names cache c r in let cr'' = - trans_seq ids kind ign_group ign_case greedy pos cache c rem in + trans_seq ids kind ign_group ign_case greedy pos names cache c rem in if A.is_eps cr'' then cr' else if A.is_eps cr' then @@ -747,8 +752,8 @@ | Sem_greedy (k, r) -> let r' = handle_case ign_case r in if is_charset r' then r' else Sem_greedy (k, r') - | Group r -> - Group (handle_case ign_case r) + | Group (n, r) -> + Group (n, handle_case ign_case r) | No_group r -> let r' = handle_case ign_case r in if is_charset r' then r' else No_group r' @@ -783,12 +788,13 @@ let ncolor = if need_lnl then ncolor + 1 else ncolor in let ids = A.create_ids () in let pos = ref 0 in + let names = ref [] in let (r, kind) = translate ids - `First false false `Greedy pos (ref Cset.CSetMap.empty) colors regexp in + `First false false `Greedy pos names (ref Cset.CSetMap.empty) colors regexp in let r = enforce_kind ids `First kind r in (*Format.eprintf "<%d %d>@." !ids ncol;*) - mk_re ~initial:r ~colors ~color_repr ~ncolor ~lnl ~group_count:(!pos / 2) + mk_re ~initial:r ~colors ~color_repr ~ncolor ~lnl ~group_names:(List.rev !names) ~group_count:(!pos / 2) (****) @@ -805,7 +811,7 @@ false | Beg_of_str | Start -> true - | Sem (_, r) | Sem_greedy (_, r) | Group r | No_group r | Nest r + | Sem (_, r) | Sem_greedy (_, r) | Group (_, r) | No_group r | Nest r | Case r | No_case r | Pmark (_, r) -> anchored r @@ -857,7 +863,7 @@ let first r = Sem (`First, r) let greedy r = Sem_greedy (`Greedy, r) let non_greedy r = Sem_greedy (`Non_greedy, r) -let group r = Group r +let group ?name r = Group (name, r) let no_group r = No_group r let nest r = Nest r let mark r = let i = Pmark.gen () in (i,Pmark (i,r)) @@ -951,7 +957,14 @@ match exec_internal ~groups:false ~partial:true "Re.exec_partial" ?pos ?len re s with Match _ -> `Full - | Running -> `Partial + | Running _ -> `Partial + | Failed -> `Mismatch + +let exec_partial_detailed ?pos ?len re s = + match exec_internal ~groups:true ~partial:true "Re.exec_partial_detailed" + ?pos ?len re s with + Match group -> `Full group + | Running { no_match_starts_before } -> `Partial no_match_starts_before | Failed -> `Mismatch module Mark = struct @@ -999,7 +1012,7 @@ let p1, p2 = Group.offset substr 0 in let pos = if p1=p2 then p2+1 else p2 in Seq.Cons (substr, aux pos) - | Running + | Running _ | Failed -> Seq.Nil in aux pos @@ -1040,7 +1053,7 @@ let state = `Yield (`Delim substr) in Seq.Cons (`Text text, aux state i pos) ) else Seq.Cons (`Delim substr, aux state i pos) - | Running -> Seq.Nil + | Running _ -> Seq.Nil | Failed -> if i < limit then ( @@ -1131,7 +1144,7 @@ p2) else Buffer.add_substring buf s p2 (limit-p2) - | Running -> () + | Running _ -> () | Failed -> Buffer.add_substring buf s pos (limit-pos) in @@ -1158,7 +1171,7 @@ | Intersection _ | Complement _ | Difference (_, _) -> assert false - | Group r + | Group (_, r) | No_group r | Nest r | Sem (_, r) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/lib/core.mli new/ocaml-re-1.11.0/lib/core.mli --- old/ocaml-re-1.10.4/lib/core.mli 2022-04-27 20:13:14.000000000 +0200 +++ new/ocaml-re-1.11.0/lib/core.mli 2023-08-19 12:51:09.000000000 +0200 @@ -74,6 +74,13 @@ (** Compile a regular expression into an executable version that can be used to match strings, e.g. with {!exec}. *) +val group_count : re -> int +(** Return the number of capture groups (including the one + corresponding to the entire regexp). *) + +val group_names : re -> (string * int) list +(** Return named capture groups with their index. *) + val exec : ?pos:int -> (** Default: 0 *) ?len:int -> (** Default: -1 (until end of string) *) @@ -155,7 +162,7 @@ ?pos:int -> (** Default: 0 *) ?len:int -> (** Default: -1 (until end of string) *) re -> string -> [ `Full | `Partial | `Mismatch ] -(** More detailed version of {!exec_p}. [`Full] is equivalent to [true], +(** More detailed version of {!execp}. [`Full] is equivalent to [true], while [`Mismatch] and [`Partial] are equivalent to [false], but [`Partial] indicates the input string could be extended to create a match. @@ -178,6 +185,18 @@ ]} *) +val exec_partial_detailed : + ?pos:int -> (** Default: 0 *) + ?len:int -> (** Default: -1 (until end of string) *) + re -> string -> [ `Full of Group.t | `Partial of int | `Mismatch ] +(** More detailed version of {!exec_opt}. [`Full group] is equivalent to [Some group], + while [`Mismatch] and [`Partial _] are equivalent to [None], but [`Partial position] + indicates that the input string could be extended to create a match, and no match could + start in the input string before the given position. + This could be used to not have to search the entirety of the input if more + becomes available, and use the given position as the [?pos] argument. +*) + (** Marks *) module Mark : sig @@ -206,7 +225,20 @@ val all : ?pos:int -> ?len:int -> re -> string -> Group.t list (** Repeatedly calls {!exec} on the given string, starting at given position and - length.*) + length. + + {5 Examples:} + {[ + # let regex = Re.compile Re.(seq [str "my"; blank; word(rep alpha)]);; + val regex : re = <abstr> + + # Re.all regex "my head, my shoulders, my knees, my toes ...";; + - : Re.substrings list = [<abstr>; <abstr>; <abstr>; <abstr>] + + # Re.all regex "My head, My shoulders, My knees, My toes ...";; + - : Re.substrings list = [] + ]} +*) type 'a gen = unit -> 'a option @@ -220,7 +252,26 @@ val matches : ?pos:int -> ?len:int -> re -> string -> string list (** Same as {!all}, but extracts the matched substring rather than returning - the whole group. This basically iterates over matched strings *) + the whole group. This basically iterates over matched strings. + + {5 Examples:} + {[ + # let regex = Re.compile Re.(seq [str "my"; blank; word(rep alpha)]);; + val regex : re = <abstr> + + # Re.matches regex "my head, my shoulders, my knees, my toes ...";; + - : string list = ["my head"; "my shoulders"; "my knees"; "my toes"] + + # Re.matches regex "My head, My shoulders, My knees, My toes ...";; + - : string list = [] + + # Re.matches regex "my my my my head my 1 toe my ...";; + - : string list = ["my my"; "my my"] + + # Re.matches ~pos:2 regex "my my my my head my +1 toe my ...";; + - : string list = ["my my"; "my head"] + ]} +*) val matches_gen : ?pos:int -> ?len:int -> re -> string -> string gen [@@ocaml.deprecated "Use Seq.matches"] @@ -295,27 +346,65 @@ ?pos:int -> (** Default: 0 *) ?len:int -> re -> string -> Group.t Seq.t - (** Same as {!all} but returns an iterator + (** Same as {!module-Re.val-all} but returns an iterator. + + {5 Examples:} + {[ + # let regex = Re.compile Re.(seq [str "my"; blank; word(rep alpha)]);; + val regex : re = <abstr> + + # Re.Seq.all regex "my head, my shoulders, my knees, my toes ...";; + - : Re.substrings Seq.t = <fun> + ]} @since 1.10.0 *) val matches : ?pos:int -> (** Default: 0 *) ?len:int -> re -> string -> string Seq.t - (** Same as {!matches}, but returns an iterator + (** Same as {!module-Re.val-matches}, but returns an iterator. + + {5 Example:} + {[ + # let regex = Re.compile Re.(seq [str "my"; blank; word(rep alpha)]);; + val regex : re = <abstr> + + # Re.Seq.matches regex "my head, my shoulders, my knees, my toes ...";; + - : string Seq.t = <fun> + ]} @since 1.10.0 *) val split : ?pos:int -> (** Default: 0 *) ?len:int -> re -> string -> string Seq.t - (** @since 1.10.0 *) + (** Same as {!module-Re.val-split} but returns an iterator. + + {5 Example:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = <abstr> + + # Re.Seq.split regex "Re,Ocaml,Jerome Vouillon";; + - : string Seq.t = <fun> + ]} + @since 1.10.0 *) val split_full : ?pos:int -> (** Default: 0 *) ?len:int -> re -> string -> split_token Seq.t - (** @since 1.10.0 *) + (** Same as {!module-Re.val-split_full} but returns an iterator. + + {5 Example:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = <abstr> + + # Re.Seq.split_full regex "Re,Ocaml,Jerome Vouillon";; + - : Re__Core.split_token Seq.t = <fun> + ]} + @since 1.10.0 *) end val replace : @@ -340,7 +429,20 @@ string (** [replace_string ~all re ~by s] iterates on [s], and replaces every occurrence of [re] with [by]. If [all = false], then only the first - occurrence of [re] is replaced. *) + occurrence of [re] is replaced. + + {5 Examples:} + {[ + # let regex = Re.compile (Re.char ',');; + val regex : re = <abstr> + + # Re.replace_string regex ~by:";" "[1,2,3,4,5,6,7]";; + - : string = "[1;2;3;4;5;6;7]" + + # Re.replace_string regex ~all:false ~by:";" "[1,2,3,4,5,6,7]";; + - : string = "[1;2,3,4,5,6,7]" + ]} +*) (** {2 String expressions (literal match)} *) @@ -482,7 +584,7 @@ (** {2 Groups (or submatches)} *) -val group : t -> t +val group : ?name:string -> t -> t (** Delimit a group. The group is considered as matching if it is used at least once (it may be used multiple times if is nested inside {!rep} for instance). If it is used multiple times, the last match is what gets @@ -589,7 +691,7 @@ | Last_end_of_line | Start | Stop | Sem of Automata.sem * outer | Sem_greedy of Automata.rep_kind * outer - | Group of outer | No_group of outer | Nest of outer + | Group of string option * outer | No_group of outer | Nest of outer | Case of outer | No_case of outer | Intersection of outer list | Complement of outer list diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/lib/group.ml new/ocaml-re-1.11.0/lib/group.ml --- old/ocaml-re-1.10.4/lib/group.ml 2022-04-27 20:13:14.000000000 +0200 +++ new/ocaml-re-1.11.0/lib/group.ml 2023-08-19 12:51:09.000000000 +0200 @@ -11,8 +11,8 @@ if 2 * i + 1 >= Array.length t.marks then raise Not_found; let m1 = t.marks.(2 * i) in if m1 = -1 then raise Not_found; - let p1 = t.gpos.(m1) - 1 in - let p2 = t.gpos.(t.marks.(2 * i + 1)) - 1 in + let p1 = t.gpos.(m1) in + let p2 = t.gpos.(t.marks.(2 * i + 1)) in (p1, p2) let get t i = @@ -44,7 +44,7 @@ if m1 <> -1 then begin let p1 = t.gpos.(m1) in let p2 = t.gpos.(t.marks.(2 * i + 1)) in - res.(i) <- (p1 - 1, p2 - 1) + res.(i) <- (p1, p2) end done; res @@ -58,7 +58,7 @@ if m1 <> -1 then begin let p1 = t.gpos.(m1) in let p2 = t.gpos.(t.marks.(2 * i + 1)) in - res.(i) <- String.sub t.s (p1 - 1) (p2 - p1) + res.(i) <- String.sub t.s p1 (p2 - p1) end done; res diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/lib/pcre.ml new/ocaml-re-1.11.0/lib/pcre.ml --- old/ocaml-re-1.10.4/lib/pcre.ml 2022-04-27 20:13:14.000000000 +0200 +++ new/ocaml-re-1.11.0/lib/pcre.ml 2023-08-19 12:51:09.000000000 +0200 @@ -1,8 +1,11 @@ module Re = Core +exception Parse_error = Perl.Parse_error +exception Not_supported = Perl.Not_supported + type regexp = Re.re -type flag = [ `CASELESS | `MULTILINE | `ANCHORED ] +type flag = [ `CASELESS | `MULTILINE | `ANCHORED | `DOTALL ] type split_result = | Text of string @@ -17,6 +20,7 @@ | `CASELESS -> `Caseless | `MULTILINE -> `Multiline | `ANCHORED -> `Anchored + | `DOTALL -> `Dotall ) flags in Perl.re ~opts pat @@ -31,6 +35,23 @@ let get_substring s i = Re.Group.get s i +let names rex = + Re.group_names rex + |> List.map fst + |> Array.of_list + +let get_named_substring rex name s = + let rec loop = function + | [] -> raise Not_found + | (n, i) :: rem when n = name -> + begin + try get_substring s i + with Not_found -> loop rem + end + | _ :: rem -> loop rem + in + loop (Re.group_names rex) + let get_substring_ofs s i = Re.Group.offset s i diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/lib/pcre.mli new/ocaml-re-1.11.0/lib/pcre.mli --- old/ocaml-re-1.10.4/lib/pcre.mli 2022-04-27 20:13:14.000000000 +0200 +++ new/ocaml-re-1.11.0/lib/pcre.mli 2023-08-19 12:51:09.000000000 +0200 @@ -1,6 +1,9 @@ +exception Parse_error +exception Not_supported + type regexp = Core.re -type flag = [ `CASELESS | `MULTILINE | `ANCHORED ] +type flag = [ `CASELESS | `MULTILINE | `ANCHORED | `DOTALL ] type groups = Core.Group.t @@ -26,6 +29,12 @@ val get_substring : groups -> int -> string (** Equivalent to {!Core.Group.get}. *) +val names : regexp -> string array +(** Return the names of named groups. *) + +val get_named_substring : regexp -> string -> groups -> string +(** Return the first matched named group, or raise [Not_found]. *) + val get_substring_ofs : groups -> int -> int * int (** Equivalent to {!Core.Group.offset}. *) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/lib/perl.ml new/ocaml-re-1.11.0/lib/perl.ml --- old/ocaml-re-1.10.4/lib/perl.ml 2022-04-27 20:13:14.000000000 +0200 +++ new/ocaml-re-1.11.0/lib/perl.ml 2023-08-19 12:51:09.000000000 +0200 @@ -108,6 +108,11 @@ r end else if accept '#' then begin comment () + end else if accept '<' then begin + let name = name () in + let r = regexp () in + if not (accept ')') then raise Parse_error; + Re.group ~name r end else raise Parse_error end else begin @@ -128,7 +133,7 @@ end else if accept '\\' then begin (* XXX - Back-references - - \cx (control-x), \e, \f, \n, \r, \t, \xhh, \ddd + - \cx (control-x), \ddd *) if eos () then raise Parse_error; match get () with @@ -156,6 +161,21 @@ Re.eos | 'G' -> Re.start + | 'e' -> + Re.char '\x1b' + | 'f' -> + Re.char '\x0c' + | 'n' -> + Re.char '\n' + | 'r' -> + Re.char '\r' + | 't' -> + Re.char '\t' + | 'x' -> + let c1 = hexdigit () in + let c2 = hexdigit () in + let code = c1 * 16 + c2 in + Re.char (char_of_int code) | 'a'..'z' | 'A'..'Z' -> raise Parse_error | '0'..'9' -> @@ -168,6 +188,13 @@ '*' | '+' | '?' | '{' | '\\' -> raise Parse_error | c -> Re.char c end + and hexdigit () = + if eos () then raise Parse_error; + match get () with + '0'..'9' as d -> Char.code d - Char.code '0' + | 'a'..'f' as d -> Char.code d - Char.code 'a' + 10 + | 'A'..'F' as d -> Char.code d - Char.code 'A' + 10 + | _ -> raise Parse_error and integer () = if eos () then None else match get () with @@ -182,6 +209,22 @@ integer' i' | _ -> unget (); Some i + and name () = + if eos () then raise Parse_error else + match get () with + ('_' | 'a'..'z' | 'A'..'Z') as c -> + let b = Buffer.create 32 in + Buffer.add_char b c; + name' b + | _ -> raise Parse_error + and name' b = + if eos () then raise Parse_error else + match get () with + ('_' | 'a'..'z' | 'A'..'Z' | '0'..'9') as c -> + Buffer.add_char b c; + name' b + | '>' -> Buffer.contents b + | _ -> raise Parse_error and bracket s = if s <> [] && accept ']' then s else begin match char () with diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/lib/posix.mli new/ocaml-re-1.11.0/lib/posix.mli --- old/ocaml-re-1.10.4/lib/posix.mli 2022-04-27 20:13:14.000000000 +0200 +++ new/ocaml-re-1.11.0/lib/posix.mli 2023-08-19 12:51:09.000000000 +0200 @@ -48,7 +48,7 @@ ]} *) -(** XXX Character classes *) +(* XXX Character classes *) exception Parse_error exception Not_supported @@ -60,10 +60,10 @@ (** Parsing of a Posix extended regular expression *) val compile : Core.t -> Core.re -(** Regular expression compilation *) +(** [compile r] is defined as [Core.compile (Core.longest r)] *) val compile_pat : ?opts:(opt list) -> string -> Core.re -(** [compile r] is defined as [Core.compile (Core.longest r)] *) +(** [compile_pat ?opts regex] compiles the Posix extended regular expression [regexp] *) (* Deviation from the standard / ambiguities in the standard diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/lib_test/test_pcre.ml new/ocaml-re-1.11.0/lib_test/test_pcre.ml --- old/ocaml-re-1.10.4/lib_test/test_pcre.ml 2022-04-27 20:13:14.000000000 +0200 +++ new/ocaml-re-1.11.0/lib_test/test_pcre.ml 2023-08-19 12:51:09.000000000 +0200 @@ -41,13 +41,21 @@ let sp = full_split ~rex "testxyyy" in assert_equal ~printer sp [Text "test"; Delim "x"; NoGroup; Text "yyy"] +let rex = regexp "(?<many_x>x+)" + +let named_groups _ = + let s = exec ~rex "testxxxyyy" in + assert_equal (get_named_substring rex "many_x" s) "xxx" + let test_fixtures = "test pcre features" >::: [ "test [:blank:] class" >:: test_blank_class ; "test splitting empty string" >:: split_empty ; "test split with max of 1" >:: split_max_1 ; "test group split 1" >:: group_split1 - ; "test group split 2 - NoGroup" >:: group_split2] + ; "test group split 2 - NoGroup" >:: group_split2 + ; "test named groups" >:: named_groups + ] let _ = run_test_tt_main test_fixtures diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/lib_test/test_re.ml new/ocaml-re-1.11.0/lib_test/test_re.ml --- old/ocaml-re-1.10.4/lib_test/test_re.ml 2022-04-27 20:13:14.000000000 +0200 +++ new/ocaml-re-1.11.0/lib_test/test_re.ml 2023-08-19 12:51:09.000000000 +0200 @@ -480,4 +480,48 @@ test "exec_partial 7" (str "") "hello" `Full; test "exec_partial 8" (whole_string (str "hello")) "" `Partial; + let test ?pos msg re input expected = + let extract_groups = function + | `Full group -> `Full (Group.all_offset group) + | `Partial _ | `Mismatch as result -> result + in + expect_pass msg (fun () -> + expect_equal_app id expected extract_groups (exec_partial_detailed ?pos (compile re) input) + ~printer:(function + | `Partial position -> Printf.sprintf "`Partial %d" position + | `Full groups -> + Array.to_list groups + |> List.map (fun (a, b) -> Printf.sprintf "%d,%d" a b) + |> String.concat ";" + |> Printf.sprintf "`Full [|%s|]" + | `Mismatch -> "`Mismatch")) + in + test "exec_partial_detailed 1" (str "hello") "he" (`Partial 0); + (* Because of how the matching engine currently works, situations where + the entirety of the input string cannot be a match like the test below + actually return the last character as a potential start instead of just + return `Partial (String.length input). This is still fine however as + it still respects the mli contract, as no match could start before + the given position, and is fine in practice as testing an extra + character on extra input doesn't add much more in terms of workload. + *) + test "exec_partial_detailed 2" (str "hello") "goodbye" (`Partial 6); + test "exec_partial_detailed 3" (str "hello") "hello" (`Full [|0, 5|]); + test "exec_partial_detailed 4" (whole_string (str "hello")) "hello" (`Full [|0, 5|]); + test "exec_partial_detailed 5" (whole_string (str "hello")) "goodbye" `Mismatch; + test "exec_partial_detailed 6" (str "hello") "" (`Partial 0); + test "exec_partial_detailed 7" (str "") "hello" (`Full [|0, 0|]); + test "exec_partial_detailed 8" (whole_string (str "hello")) "" (`Partial 0); + test "exec_partial_detailed 9" (str "abc") ".ab.ab" (`Partial 4); + test "exec_partial_detailed 10" + ~pos:1 + (seq [ not_boundary; str "b"]) + "ab" + (`Full [|1, 2|]); + test "exec_partial_detailed 11" + (seq [ group (str "a"); rep any; group (str "b")]) + ".acb." + (`Full [|1, 4; 1, 2; 3, 4|]); + + run_test_suite "test_re" diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/ocaml-re-1.10.4/re.opam new/ocaml-re-1.11.0/re.opam --- old/ocaml-re-1.10.4/re.opam 2022-04-27 20:13:14.000000000 +0200 +++ new/ocaml-re-1.11.0/re.opam 2023-08-19 12:51:09.000000000 +0200 @@ -8,7 +8,7 @@ "Rudi Grinberg" "Gabriel Radanne" ] -license: "LGPL-2.0-or-later WITH OCaml-LGPL-linking-exception" +license: "LGPL-2.1-or-later WITH OCaml-LGPL-linking-exception" homepage: "https://github.com/ocaml/ocaml-re" bug-reports: "https://github.com/ocaml/ocaml-re/issues" dev-repo: "git+https://github.com/ocaml/ocaml-re.git"