#! /home/users/tomc/bin/rebol -sq rebol[title: "k-nucleotide"] ;;; your shebang path will vary
;;; read line-by-line a redirected FASTA format file from stdin set-modes system/ports/input [lines: false binary: false] ;;; extract DNA sequence THREE here: find/tail find/tail copy system/ports/input "^/>THREE " "^/" fasta: replace/all here "^/" "" len: length? fasta k-nucl: func ["function to update a hash of k-nucleotide keys and count values" k [integer!] n[series!] hash [block!] /local l m ][ m: copy/part n k either k = length? m [either l: find hash/:k m [l: next l change l 1 + first l] [insert tail hash/:k m insert tail hash/:k 1] ][return] ] ;;; count all the 3- 4- 6- 12- and 18-nucleotide sequences, kay: [1 2 3 4 6 12 18] mers: make block! last kay ;;; could sort if not ordered loop last kay [insert/only tail mers make hash![]] forall fasta[ kay: head kay forall kay[k-nucl first kay fasta mers] ] ;;; for all the 1-nucleotide and 2-nucleotide sequences, ;;; sorted by descending frequency and then ascending k-nucleotide key repeat i 2[sort/skip mers/:i 2 sort/skip head reverse mers/:i 2 foreach [n c] head reverse mers/:i[print [n 100 * c / len ]] print "" ] ;;; write the count and code for specific sequences foreach seq ["ggt" "ggta" "ggtatt" "ggtattttaatt" "ggtattttaatttatagt"][ l: length? seq ;;; newer REBOL versions would not need this line print [either c: select mers/:l seq [c][0]tab seq] ] ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; my output ~ 7 seconds % k-nucleotide.r < k-nucleotide-input.fasta a 30.284 t 29.796 c 20.312 g 19.608 aa 9.212 at 8.95 tt 8.948 ta 8.936 ca 6.166 ct 6.1 ac 6.086 tc 6.042 ag 6.036 ga 5.968 tg 5.868 gt 5.798 cc 4.14 gc 4.044 cg 3.906 gg 3.798 562 ggt 152 ggta 15 ggtatt 0 ggtattttaatt 0 ggtattttaatttatagt -- ... nice weather eh -- To unsubscribe from the list, just send an email to lists at rebol.com with unsubscribe as the subject.
