#! /home/users/tomc/bin/rebol -sq
rebol[title: "k-nucleotide"] ;;; your shebang path will vary

;;; read line-by-line a redirected FASTA format file from stdin
set-modes system/ports/input [lines: false binary: false]

;;; extract DNA sequence THREE
here: find/tail find/tail copy system/ports/input "^/>THREE " "^/"
fasta: replace/all here "^/" ""
len: length? fasta

k-nucl: func ["function to update a hash of k-nucleotide keys and count
values"
k [integer!] n[series!] hash [block!] /local l m ][
m: copy/part n k
either k = length? m
[either l: find hash/:k m
[l: next l change l 1 + first l]
[insert tail hash/:k m insert tail hash/:k 1]
][return]
]

;;; count all the 3- 4- 6- 12- and 18-nucleotide sequences,
kay: [1 2 3 4 6 12 18]
mers: make block! last kay ;;; could sort if not ordered
loop last kay [insert/only tail mers make hash![]]

forall fasta[ kay: head kay
forall kay[k-nucl first kay fasta mers]
]

;;; for all the 1-nucleotide and 2-nucleotide sequences,
;;; sorted by descending frequency and then ascending k-nucleotide key
repeat i 2[sort/skip mers/:i 2 sort/skip head reverse mers/:i 2
foreach [n c] head reverse mers/:i[print [n 100 * c / len ]] print ""
]

;;; write the count and code for specific sequences
foreach seq ["ggt" "ggta" "ggtatt" "ggtattttaatt" "ggtattttaatttatagt"][
l: length? seq ;;; newer REBOL versions would not need this line
print [either c: select mers/:l seq [c][0]tab seq]
]

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
my output ~ 7 seconds
% k-nucleotide.r < k-nucleotide-input.fasta
a 30.284
t 29.796
c 20.312
g 19.608

aa 9.212
at 8.95
tt 8.948
ta 8.936
ca 6.166
ct 6.1
ac 6.086
tc 6.042
ag 6.036
ga 5.968
tg 5.868
gt 5.798
cc 4.14
gc 4.044
cg 3.906
gg 3.798

562 ggt
152 ggta
15 ggtatt
0 ggtattttaatt
0 ggtattttaatttatagt


--
... nice weather eh

-- 
To unsubscribe from the list, just send an email to 
lists at rebol.com with unsubscribe as the subject.

Reply via email to