Hello
I'm sorry, I tried to prepare a shorter example and mixed up the loops and recurs.
The full example is attached. This code implements something like the
'strings' command on Unix systems, but for the UTF-8 encoding.
The problem is that I need to return either a String or an Integer from the
function 'read-utf-char', depending on a condition. Or is this not allowed by
Clojure? I mostly program in Scheme, which allows such tricks.
.Bill Smith at "Wed, 30 Dec 2009 11:28:54 -0800 (PST)" wrote:
.S> Sorry, I'm confused by the code sample. I see several loops but no
.S> corresponding recurs.
--
You received this message because you are subscribed to the Google
Groups "Clojure" group.
To post to this group, send email to clojure@googlegroups.com
Note that posts from new members are moderated - please be patient with your
first post.
To unsubscribe from this group, send email to
clojure+unsubscr...@googlegroups.com
For more options, visit this group at
http://groups.google.com/group/clojure?hl=en
(ns test1
(:import (java.io File InputStream FileInputStream InputStreamReader))
(:import (java.nio.charset Charset))
(:use [clojure.contrib.def :only (defvar-)])
)
(defn is-ascii?
  "Returns true when n is a printable ASCII code (0x20-0x7e) or one of
  tab (9), line feed (10) or carriage return (13); false otherwise."
  [#^Integer n]
  (cond
    (<= 32 n 126) true   ;; printable range, both ends inclusive
    (== n 9)      true   ;; \t
    (== n 10)     true   ;; \n
    :else         (== n 13))) ;; \r
;; Charset passed to the String constructor in read-utf-char when decoding
;; the bytes collected for one multi-byte sequence.
(defvar- utf-8-locale (Charset/forName "UTF-8"))
;; Charset used for the InputStreamReader in extract-text-utf-8. ISO-8859-1
;; maps every byte to the character with the same code, so each .read on that
;; reader yields the raw byte value (0-255) rather than a decoded character.
(defvar- latin-1-locale (Charset/forName "ISO-8859-1"))
;;
(defn- clear-strbuf
  "Empties strbuf in place and returns the same StringBuffer."
  [#^StringBuffer strbuf]
  (doto strbuf
    (.setLength 0)))
(defn- append-from-strbuf
  "Flushes strbuf into lst.

  When strbuf holds at least n characters its contents are consed onto the
  front of lst and that new list is returned; otherwise lst is returned
  unchanged. In both cases strbuf is emptied."
  [lst
   #^StringBuffer strbuf
   #^Integer n]
  ;; Snapshot the text first (only when it qualifies), then clear exactly once.
  (let [collected (when (>= (.length strbuf) n)
                    (.toString strbuf))]
    (clear-strbuf strbuf)
    (if collected
      (cons collected lst)
      lst)))
(defn- detect-utf-n
  "Classifies char as a UTF-8 lead byte and returns how many continuation
  bytes should follow it: 1, 2 or 3 for the 110xxxxx, 1110xxxx and 11110xxx
  patterns. Returns 0 for anything below 0xC0 and for values matching none
  of those patterns."
  [#^Integer char]
  (let [matches? (fn [mask pattern]
                   (= (bit-and char mask) pattern))]
    (cond
      (< char 0xC0)        0
      (matches? 0xE0 0xC0) 1 ;; 2-byte sequence
      (matches? 0xF0 0xE0) 2 ;; 3-byte sequence
      (matches? 0xF8 0xF0) 3 ;; 4-byte sequence
      :else                0)))
(defn- read-utf-char
  "Tries to read the remainder of a multi-byte UTF-8 sequence.

  istream - reader to pull the following values from; it is expected to yield
            one raw byte value (0-255) per .read call
  char    - lead byte of the sequence, already consumed by the caller
  utf-n   - number of continuation bytes expected after the lead byte

  Returns the decoded String when all utf-n following values have the
  10xxxxxx continuation pattern. Otherwise returns the integer that broke the
  sequence (the result of .read, i.e. -1 at end of stream). Callers tell the
  two outcomes apart with string?."
  [#^InputStreamReader istream
   #^Integer char
   #^Integer utf-n]
  (let [barr (make-array Byte/TYPE (+ 1 utf-n))
        ;; Incoming values are 0..255 but a Java byte is signed. Fold the
        ;; upper half into -128..-1 before coercing, so the byte conversion
        ;; cannot fail its range check on values such as 0xC3.
        put-byte (fn [idx b]
                   (aset-byte barr idx (byte (if (> b 127) (- b 256) b))))]
    (put-byte 0 char)
    ;; cnt = number of continuation bytes stored so far
    (loop [cnt 0]
      (if (= cnt utf-n)
        (String. barr utf-8-locale) ;; complete sequence: decode it
        (let [ch (.read istream)
              ncnt (+ 1 cnt)]
          (if (= (bit-and ch 0xC0) 0x80)
            (do
              (put-byte ncnt ch)
              (recur ncnt))
            ;; not a continuation byte (or EOF): hand it back to the caller
            ch))))))
(defn- extract-text-utf-8
  "Performs text extraction for UTF-8 encoding.

  istream - stream to scan; it is read to the end but not closed here
  n       - minimum length (in characters) a run of text must reach to be kept

  Reads istream byte by byte, accumulating runs made of accepted ASCII
  characters and well-formed multi-byte UTF-8 sequences. Any other byte ends
  the current run. Returns the kept runs as a list of Strings in the order
  they occur in the stream."
  [#^InputStream istream
   #^Integer n]
  (let [;; ISO-8859-1 reader: every .read returns one raw byte value (or -1)
        #^InputStreamReader ireader (new InputStreamReader istream
                                         latin-1-locale)
        #^StringBuffer strbuf (new StringBuffer)]
    (loop [lst '()
           char (.read ireader)]
      (let [utf-n (detect-utf-n char)]
        (cond
          ;; end of stream: flush whatever is pending and finish
          (== char -1) (reverse (append-from-strbuf lst strbuf n))
          (is-ascii? char) (do
                             (.append strbuf (Character/toChars char))
                             (recur lst (.read ireader)))
          (> utf-n 0) (let [res (read-utf-char ireader char utf-n)]
                        (if (string? res)
                          (do
                            (.append strbuf res)
                            (recur lst (.read ireader)))
                          ;; Invalid sequence: res is the value that broke it
                          ;; and has already been consumed from the reader, so
                          ;; process it as the next char instead of reading a
                          ;; fresh one (which would silently drop a byte).
                          ;; int keeps the recur argument primitive like the
                          ;; loop local it rebinds.
                          (recur (append-from-strbuf lst strbuf n)
                                 (int res))))
          :else (recur (append-from-strbuf lst strbuf n)
                       (.read ireader)))))))
(defn analyse-stream
"Extracts text from stream by delegating to extract-text-utf-8 with stream
and n, and returns its result. locale-name is accepted but currently
ignored: extraction is always done as UTF-8 regardless of its value."
[#^InputStream stream
#^Integer n
#^String locale-name]
(extract-text-utf-8 stream n))
(defn analyse-file
  "Opens file, runs analyse-stream on its contents and returns the result.

  n and locale-name are passed through to analyse-stream unchanged. The file
  stream is closed before this function returns, including when the analysis
  throws."
  [#^File file
   #^Integer n
   #^String locale-name]
  ;; with-open closes the FileInputStream in a finally clause, so the file
  ;; handle is released even on error.
  (with-open [#^FileInputStream stream (new FileInputStream file)]
    (analyse-stream stream n locale-name)))
--
With best wishes, Alex Ott, MBA
http://alexott.blogspot.com/ http://xtalk.msk.su/~ott/
http://alexott-ru.blogspot.com/