#!/usr/sbin/perl
if (!@ARGV) {
print STDERR "usage: $0 alignment_file
[threshold%]...\n";
print_sets();
exit 0;
}
my $FILE = shift
@ARGV;
my @THRESHOLD;
if (@ARGV) {
@THRESHOLD = @ARGV;
} else {
@THRESHOLD = (90, 80, 70, 60,
50);
}
my @ID;
my @ALIGNMENT;
read_alignment($FILE);
$n=@ID;
for ($i=0; $i<$n; $i++){
printf "%-15s %s\n", $ID[$i], join("", @{$ALIGNMENT{$ID[$i]}});
}
sub read_alignment {
my ($file) = @_;
my ($id, $line, %alignment);
local (*TMP);
open(TMP, "< $file") or die
"can't open file '$file'\n";
while ($line = <TMP>) {
next
if $line =~ /^CLUSTAL/;
if ($line =~ /^([^ ]+)\s+([-a-zA-Z*.]+) *$/) {
if (! $alignment{$1}) {
# new sequence identifier
push (@ID, $1);
}
#strip spaces,tabs,newlines: extend alignment array
$line = $2;
$line =~ tr/ \t\n//d;
push (@{$ALIGNMENT{$1}}, split("", $line));
}
}
close TMP;
}
If the program was working properly the subroutine read_alignment should be parsing the input file line by line and holding in the array @ID the identifier of the sequences only if the identifier has not been seen before (sequence identifiers are in the first column of the input file). However, if I print the elements of @ID, this will contain the same sequence identifier repeated as many times as it appeared in the input file. The statement "if (! $alignment{$1})" should have been taking care of this but, at least in my machine, it is not. The attached file contain an input file, so you can check what I am saying.Any help will be welcomed,
Pedro Reche
-- *************************************************************************** PEDRO a. RECHE gallardo, pHD TL: 617 632 3824 Scientist, Mol.Immnunol.Foundation, FX: 617 632 3351 Dana-Farber Cancer Institute, EM: [EMAIL PROTECTED] Harvard Medical School, URL: http://www.reche.org 44 Binney Street, D610C, Boston, MA 02115 ***************************************************************************
CLUSTAL W(1.60) multiple sequence alignment
YPK1 SQLSWKRLLMKGYIPPYKPAVS-----NSMDTSNFDEEFTR---EKPIDSVVDEYLSESV
YPK2 KDISWKKLLLKGYIPPYKPIVK-----SEIDTANFDQEFTK---EKPIDSVVDEYLSASI
KPCA_HUMAN RRIDWEKLENREIQPPFKPKVC------GKGAENFDKFFTR---GQPVLTPPDQLVIANI
KPCZ_HUMAN RSIDWDLLEKKQALPPFQPQIT-----DDYGLDNFDTQFTS---EPVQLTPDDEDAIKRI
KAPA KEVVWEKLLSRNIETPYEPPIQ----QGQGDTSQFDKYPE----EDINYGVQGEDPYADL
KAPC NEVIWEKLLARYIETPYEPPIQ----QGQGDTSQFDRYPE----EEFNYGIQGEDPYMDL
KAPB SEVVWERLLAKDIETPYEPPIT----SGIGDTSLFDQYPE----EQLDYGIQGDDPYAEY
KS6_HUMAN RHINWEELLARKVEPPFKPLLQ-----SEEDVSQFDSKFTR---QTPVDSP-DDSTLSES
KPC1 RNINFDDILNLRVKPPYIPEIK-----SPEDTSYFEQEFTS---APPTLTPLPSVLTTSQ
KRAC_BOVIN ASIVWQDVYEKKLSPPFKPQVT-----SETDTRYFDEEFTA---QMITITPPDQDDSMEG
SCH9 ADIDWEALKQKKIPPPFKPHLV-----SETDTSNFDPEFTT---ASTSYMNKHQPMMTAT
KGP1_DROME LGFDWDGLASQLLIPPFVRPIA-----HPTDVRYFDRFPC------DLNEPPDELSGWDA
ARK2_RAT KGIDWQYVYLRKYPPPLIPPRGEVNAADAFDIGSFDEEDTKG--IKLLDCDQDLYKNFPL
DBFB AEINFETLRTS--SPPFIPQLD-----DETDAGYFDDFTNEEDMAKYADVFKRQNKLSAM
DBF2 ADINFSTLRSM--IPPFTPQLD-----SETDAGYFDDFTSEADMAKYADVFKRQDKLTAM
* *.
YPK1 ------QKQF
YPK2 ------QKQF
KPCA_HUMAN D-----QSDF
KPCZ_HUMAN D-----QSEF
KAPA ------FRDF
KAPC ------MKEF
KAPB ------FQDF
KS6_HUMAN A-----NQVF
KPC1 ------QEEF
KRAC_BOVIN VDS-ERRPHF
SCH9 PLSPAMQAKF
KGP1_DROME --------DF
ARK2_RAT MISERWQQEV
DBFB VDDSAVDSKL
DBF2 VDDSAVSSKL
