tomas wrote: > If you are doing this in Emacs Lisp, after all, there /is/ > a Levenshtein distance function in there. Finding its name > is left as an exercise to the reader, though...
(I know of course, `string-distance'.) I thought I was just going to experiment some in Elisp but now I've done the whole multi-test idea with it, mostly to show the idea and I wanted to realize something practically. I yank the source last, before that, an example run and output. It is not done but you will get it. Maybe one should use CL for this if Elisp and/or the programmer's skills are such, it will be all to slow on big data. One can start with optimizing the Elisp but it can be an uphill battle if it is way too slow. No idea if that is the case, only tried with small data. On the TODO list! But again, I wanted to show the idea and that it works. There are only three tests, but one can add more. As for `string-distance', I was hesitant if that should be added since it isn't the way a human would think, and the idea was to make it similar to the human concept of originality. But now - I don't know, maybe I'll just include every single test that makes sense and I can think of? -*- Example run. "min" means, it isn't original compared to this. "max" means, it is. Yes, I only tried with 4 sentences so far :) TOFT - originality finder tool String: "duncan is a dork" min: 30% - In a random string. max: 64% - What's up what's up wanna party or n... sum: 250 avg: 50% Data executed on: What's up what's up wanna party or not say. Maybe to some people, it is true. In a random string. I know right? -*- Source - not done but works: ;;; -*- lexical-binding: t -*- ;; ;; this file: ;; https://dataswamp.org/~incal/emacs-init/b-a-original.el (require 'cl-lib) (require 'pcase) (require 'thingatpt) ;; useful (defun string-words (str &optional no-sort) (let ((words (split-string (downcase str) "[[:space:]()]+" t "[[:punct:]]+"))) (if no-sort words (sort words) ))) (defun buffer-sentences (&optional beg end no-sort) (or beg (setq beg (point-min))) (or end (setq end (point-max))) (save-mark-and-excursion (goto-char beg) (cl-loop with buf-strs with next-str while (< (point) end) do (setq next-str (sentence-at-point t)) (when next-str (push next-str buf-strs) ) (forward-sentence) finally return (if no-sort buf-strs (sort buf-strs)) ))) ;; interface (defun prepare-string (str) (format "%s\n" (truncate-string-to-width (string-trim (replace-regexp-in-string "\n" " " str)) 50 nil nil "...") )) (defun print-results (res &optional init) (let ((buf-dst (get-buffer-create "*original*"))) (with-current-buffer buf-dst (erase-buffer) (insert "TOFT - originality finder tool\n") (when init (insert "\n" (prepare-string (format "String: \"%s\"" init)))) (insert "\n") (insert (prepare-string (format "min: %d%% - %s" (nth 0 res) (nth 1 res)))) (insert (prepare-string (format "max: %d%% - %s" (nth 2 res) (nth 3 res)))) (insert (format "sum: %d\n" (nth 4 res))) (insert (format "avg: %d%%\n" (nth 5 res))) (goto-char (point-min))) (pop-to-buffer buf-dst) )) (defun string-compare-buffer (str &optional beg end) (interactive (list (read-string "string: ") (when (use-region-p) (list (region-beginning) (region-end))) )) (or beg (setq beg (point-min))) (or end (setq end (point-max))) (print-results (string-compare-many str (buffer-sentences beg end)) str) ) ; TODO: fix ;; main test (defun string-compare-many (str str-all) (cl-loop with min with max with min-str with max-str with sum = 0 with mean with s-score for s in str-all do (setq s-score (string-compare str s)) (cl-incf sum s-score) (when (or (not min) (< s-score min)) (setq min s-score) (setq min-str s) ) (when (or (not max) (< max s-score)) (setq max s-score) (setq max-str s) ) finally (setq mean (floor (/ sum (length str-all) 1.0))) finally return (list min min-str max max-str sum mean) )) (defun string-compare (s1 s2 &optional full) (let ((tests (list #'length-test #'number-of-words-test #'same-words-test ))) (cl-loop with res with score = 0 with f-score for f in tests do (setq f-score (apply f (list s1 s2))) (push f-score res) (cl-incf score f-score) finally (setq score (floor (/ score (length tests) 1.0))) finally return (if full res score) ))) ;; individual tests, helpers (defun originality-score (a b) (when (zerop a) (setq a 1)) (when (zerop b) (setq b 1)) (pcase-let*((`(,n ,d) (if (< a b) `(,a ,b) `(,b ,a)))) (floor (- 100 (* 100 (/ n d 1.0))) ))) ;; individual tests (defun length-test (s1 s2) (let ((l1 (length s1)) (l2 (length s2)) ) (originality-score l1 l2) )) (defun number-of-words-test (s1 s2) (let ((nw1 (length (string-split s1))) (nw2 (length (string-split s2))) ) (originality-score nw1 nw2) )) (defun same-words-test (s1 s2) (let*((w1 (string-words s1)) (w2 (string-words s2)) (len-w1 (length w1)) (len-w2 (length w2)) (len-short (if (< len-w1 len-w2) len-w1 len-w2)) (same (length (cl-intersection w1 w2 :test #'string=))) ) (originality-score len-short same) )) (provide 'b-a-original) -- underground experts united https://dataswamp.org/~incal