tomas wrote:

> If you are doing this in Emacs Lisp, after all, there /is/
> a Levenshtein distance function in there. Finding its name
> is left as an exercise to the reader, though...

(I know of course, `string-distance'.)

I thought I was just going to experiment some in Elisp but now
I've done the whole multi-test idea with it, mostly to show the
idea and I wanted to realize something practically.

I yank the source last, before that, an example run
and output. It is not done but you will get it.

Maybe one should use CL for this if Elisp and/or the
programmer's skills are such, it will be all to slow on
big data. One can start with optimizing the Elisp but it can
be an uphill battle if it is way too slow. No idea if that is
the case, only tried with small data. On the TODO list!

But again, I wanted to show the idea and that it works.
There are only three tests, but one can add more. As for
`string-distance', I was hesitant if that should be added
since it isn't the way a human would think, and the idea was
to make it similar to the human concept of originality.
But now - I don't know, maybe I'll just include every single
test that makes sense and I can think of?

-*-

Example run. "min" means, it isn't original compared to this.
"max" means, it is. Yes, I only tried with 4 sentences so far :)

TOFT - originality finder tool

String: "duncan is a dork"

min: 30% - In a random string.
max: 64% - What's up what's up wanna party or n...
sum: 250
avg: 50%

Data executed on:

  What's up what's up wanna party or not say. Maybe to some
  people, it is true. In a random string. I know right?

-*-

Source - not done but works:

;;; -*- lexical-binding: t -*-
;;
;; this file:
;;   https://dataswamp.org/~incal/emacs-init/b-a-original.el

(require 'cl-lib)
(require 'pcase)
(require 'thingatpt)

;; useful

(defun string-words (str &optional no-sort)
  (let ((words (split-string (downcase str) "[[:space:]()]+" t "[[:punct:]]+")))
    (if no-sort
        words
      (sort words) )))

(defun buffer-sentences (&optional beg end no-sort)
  (or beg (setq beg (point-min)))
  (or end (setq end (point-max)))
  (save-mark-and-excursion
    (goto-char beg)
    (cl-loop
      with buf-strs
      with next-str
      while (< (point) end) do
        (setq next-str (sentence-at-point t))
        (when next-str
          (push next-str buf-strs) )
        (forward-sentence)
      finally return (if no-sort buf-strs (sort buf-strs)) )))

;; interface

(defun prepare-string (str)
  (format "%s\n"
    (truncate-string-to-width
      (string-trim (replace-regexp-in-string "\n" " " str))
        50 nil nil "...") ))

(defun print-results (res &optional init)
  (let ((buf-dst (get-buffer-create "*original*")))
    (with-current-buffer buf-dst
      (erase-buffer)
      (insert "TOFT - originality finder tool\n")
      (when init
        (insert "\n" (prepare-string (format "String: \"%s\"" init))))
      (insert "\n")
      (insert (prepare-string (format "min: %d%% - %s" (nth 0 res) (nth 1 
res))))
      (insert (prepare-string (format "max: %d%% - %s" (nth 2 res) (nth 3 
res))))
      (insert (format "sum: %d\n" (nth 4 res)))
      (insert (format "avg: %d%%\n" (nth 5 res)))
      (goto-char (point-min)))
    (pop-to-buffer buf-dst) ))

(defun string-compare-buffer (str &optional beg end)
  (interactive
    (list (read-string "string: ")
          (when (use-region-p) (list (region-beginning) (region-end))) ))
  (or beg (setq beg (point-min)))
  (or end (setq end (point-max)))
  (print-results
    (string-compare-many str (buffer-sentences beg end))
    str) ) ; TODO: fix

;; main test

(defun string-compare-many (str str-all)
   (cl-loop
     with min
     with max
     with min-str
     with max-str
     with sum = 0
     with mean
     with s-score
     for s in str-all do
       (setq s-score (string-compare str s))
       (cl-incf sum s-score)
       (when (or (not min) (< s-score min))
         (setq min s-score)
         (setq min-str s) )
       (when (or (not max) (< max s-score))
         (setq max s-score)
         (setq max-str s) )
     finally (setq mean (floor (/ sum (length str-all) 1.0)))
     finally return (list min min-str max max-str sum mean) ))

(defun string-compare (s1 s2 &optional full)
  (let ((tests (list
                 #'length-test
                 #'number-of-words-test
                 #'same-words-test
                 )))
    (cl-loop
      with res
      with score = 0
      with f-score
      for f in tests do
        (setq f-score (apply f (list s1 s2)))
        (push f-score res)
        (cl-incf score f-score)
      finally (setq score (floor (/ score (length tests) 1.0)))
      finally return (if full res score) )))

;; individual tests, helpers

(defun originality-score (a b)
  (when (zerop a) (setq a 1))
  (when (zerop b) (setq b 1))
  (pcase-let*((`(,n ,d) (if (< a b) `(,a ,b) `(,b ,a))))
    (floor (- 100 (* 100 (/ n d 1.0))) )))

;; individual tests

(defun length-test (s1 s2)
  (let ((l1 (length s1))
        (l2 (length s2)) )
    (originality-score l1 l2) ))

(defun number-of-words-test (s1 s2)
  (let ((nw1 (length (string-split s1)))
        (nw2 (length (string-split s2))) )
    (originality-score nw1 nw2) ))

(defun same-words-test (s1 s2)
  (let*((w1 (string-words s1))
        (w2 (string-words s2))
        (len-w1 (length w1))
        (len-w2 (length w2))
        (len-short (if (< len-w1 len-w2) len-w1 len-w2))
        (same (length (cl-intersection w1 w2 :test #'string=))) )
    (originality-score len-short same) ))

(provide 'b-a-original)

-- 
underground experts united
https://dataswamp.org/~incal

Reply via email to