Hello

I'm sorry, I tried to prepare a shorter example and mixed up the loops and
recurs. The full example is attached. It is code that implements something
like the 'strings' command on Unix systems, but for the UTF-8 encoding.

The problem is that I need to return either a String or an Integer from the
function 'read-utf-char', depending on a condition. Or is this not allowed by
Clojure? I mostly program in Scheme, which allows such tricks.

.Bill Smith  at "Wed, 30 Dec 2009 11:28:54 -0800 (PST)" wrote:
 .S> Sorry, I'm confused by the code sample.  I see several loops but no
 .S> corresponding recurs.

-- 
You received this message because you are subscribed to the Google
Groups "Clojure" group.
To post to this group, send email to clojure@googlegroups.com
Note that posts from new members are moderated - please be patient with your 
first post.
To unsubscribe from this group, send email to
clojure+unsubscr...@googlegroups.com
For more options, visit this group at
http://groups.google.com/group/clojure?hl=en
(ns test1
  (:import (java.io File InputStream FileInputStream InputStreamReader))
  (:import (java.nio.charset Charset))
  (:use [clojure.contrib.def :only (defvar-)])
  )

(defn is-ascii?
  "Returns true when the code point n counts as plain text: printable ASCII
  (32..126 inclusive), tab (9), line feed (10) or carriage return (13).
  Every other value, including vertical tab (11), form feed (12) and the
  end-of-stream marker -1, yields false."
  [#^Integer n]
  (let [printable?  (<= 32 n 126)           ;; 0x20-0x7e
        whitespace? (or (== n 9)            ;; \t
                        (== n 10)           ;; \n
                        (== n 13))]         ;; \r
    (or printable? whitespace?)))

;; Charset used to decode a collected multi-byte sequence into a String.
(def #^{:private true} utf-8-locale (Charset/forName "UTF-8"))
;; Charset used to read the raw input: ISO-8859-1 maps every byte to the
;; character with the same code, so reading through it yields octet values.
(def #^{:private true} latin-1-locale (Charset/forName "ISO-8859-1"))

;;
(defn- clear-strbuf
  "Empties strbuf in place and returns that same buffer."
  [#^StringBuffer strbuf]
  ;; Truncating to length zero discards all content; doto hands the buffer back.
  (doto strbuf
    (.setLength 0)))

(defn- append-from-strbuf
  "Flushes strbuf. When it holds at least n characters its content is
  prepended (as a String) to lst and the new list is returned; otherwise lst
  is returned unchanged. In both cases strbuf is emptied."
  [lst
   #^StringBuffer strbuf
   #^Integer n]
  (let [long-enough? (>= (.length strbuf) n)
        ;; Snapshot the text before the buffer is wiped.
        text         (when long-enough? (.toString strbuf))]
    (clear-strbuf strbuf)
    (if long-enough?
      (cons text lst)
      lst)))

(defn- detect-utf-n
  "Classifies char as a UTF-8 lead octet and returns how many further octets
  the sequence needs: 1, 2 or 3. Returns 0 for anything below 0xC0 and for
  values that match none of the lead-octet bit patterns."
  [#^Integer char]
  (let [;; [mask expected-bits extra-octets]
        lead-patterns [[0xE0 0xC0 1]    ;; 2-bytes seq
                       [0xF0 0xE0 2]    ;; 3-bytes seq
                       [0xF8 0xF0 3]]   ;; 4-bytes seq
        match-extra   (fn [[mask expected extra]]
                        (when (= (bit-and char mask) expected)
                          extra))]
    (if (< char 0xC0)
      0
      (or (some match-extra lead-patterns)
          0))))

(defn- read-utf-char
  "Collects one multi-byte UTF-8 sequence whose lead octet, char, has already
  been read, pulling utf-n further octets from istream.

  Returns a String holding the decoded sequence when every following octet is
  a continuation octet (bit pattern 10xxxxxx). Otherwise returns, as a number,
  the first value read that is not a continuation octet; this is -1 when the
  stream ended. Returning two different types from one function is legal in
  Clojure; the caller tells them apart with string?."
  [#^InputStreamReader istream
   #^Integer char
   #^Integer utf-n]
  (let [barr      (make-array Byte/TYPE (+ 1 utf-n))
        ;; Octets here are always >= 0x80, which is outside the signed byte
        ;; range. Newer Clojure versions range-check the byte coercion done by
        ;; aset-byte and throw, so wrap the value into -128..127 first.
        put-octet (fn [idx octet]
                    (aset-byte barr idx
                               (byte (if (> octet 127) (- octet 256) octet))))]
    (put-octet 0 char)
    ;; loop over rest of characters, checking their validity
    (loop [cnt 0]
      (if (= cnt utf-n)
        (String. barr utf-8-locale) ;; if we read all characters
        (let [ch   (.read istream) ;; read rest of characters
              ncnt (+ 1 cnt)]
          (if (= (bit-and ch 0xC0) 0x80)
            (do
              (put-octet ncnt ch)
              (recur ncnt))
            ;; Not a continuation octet: hand it back for the caller to handle.
            ch))))))

(defn- extract-text-utf-8
  "Performs text extraction for UTF-8 encoding.

  Reads istream octet by octet and returns, in input order, the runs of text
  that are at least n characters long. A run consists of characters accepted
  by is-ascii? and of well-formed multi-byte UTF-8 sequences; any other octet
  ends the current run. The stream is read to its end but is not closed here."
  [#^InputStream istream
   #^Integer n
   ]
  (let [;; Reading through ISO-8859-1 makes .read return raw octet values.
        #^InputStreamReader ireader (new InputStreamReader istream
                                         latin-1-locale)
        #^StringBuffer strbuf (new StringBuffer)
        ]
    (loop [lst '()
           char (.read ireader)]
      (let [utf-n (detect-utf-n char)]
        (cond
          ;; End of stream: flush what is pending and restore input order.
          (== char -1) (reverse (append-from-strbuf lst strbuf n))
          (is-ascii? char) (do
                             (.append strbuf (Character/toChars char))
                             (recur lst (.read ireader)))
          (> utf-n 0) (let [res (read-utf-char ireader char utf-n)]
                        (if (string? res)
                          (do
                            (.append strbuf res)
                            (recur lst (.read ireader)))
                          ;; The sequence was cut short and res is the octet
                          ;; that broke it (or -1 at end of stream). Process
                          ;; it on the next iteration instead of dropping it.
                          ;; The int coercion keeps the recur argument
                          ;; primitive, matching the loop local char.
                          (recur (append-from-strbuf lst strbuf n)
                                 (int res))))
          :else (recur (append-from-strbuf lst strbuf n)
                       (.read ireader)))))))

(defn analyse-stream
  "Extracts text runs of at least n characters from the given input stream.
  Delegates to the UTF-8 extractor; locale-name is accepted but is not
  consulted by this function."
  [#^InputStream stream
   #^Integer n
   #^String locale-name]
  (-> stream
      (extract-text-utf-8 n)))

(defn analyse-file
  "Performs analysis of the given file and returns the extracted text runs.
  Opens file, passes the stream together with n and locale-name to
  analyse-stream, and closes the stream before returning, also when an
  exception is thrown."
  [#^File file
   #^Integer n
   #^String locale-name]
  ;; with-open guarantees the file handle is released. This relies on
  ;; analyse-stream building its result before returning; a lazy result would
  ;; be read after the stream is closed.
  (with-open [stream (new FileInputStream file)]
    (analyse-stream stream n locale-name)))
-- 
With best wishes, Alex Ott, MBA
http://alexott.blogspot.com/        http://xtalk.msk.su/~ott/
http://alexott-ru.blogspot.com/

Reply via email to