Sascha Steinbiss pushed to branch upstream at Debian Med / fastaq
Commits: e4dd7b7f by Sascha Steinbiss at 2018-02-22T10:22:16+01:00 New upstream version 3.17.0 - - - - - 3 changed files: - pyfastaq/sequences.py - pyfastaq/tests/sequences_test.py - setup.py Changes: ===================================== pyfastaq/sequences.py ===================================== --- a/pyfastaq/sequences.py +++ b/pyfastaq/sequences.py @@ -1,9 +1,8 @@ import copy import re -import string import random import itertools - +from collections import Counter from pyfastaq import utils, intervals, genetic_codes class Error (Exception): pass @@ -465,6 +464,47 @@ class Fasta: '''Returns a Fasta sequence, translated into amino acids. Starts translating from 'frame', where frame expected to be 0,1 or 2''' return Fasta(self.id, ''.join([genetic_codes.codes[genetic_code].get(self.seq[x:x+3].upper(), 'X') for x in range(frame, len(self)-1-frame, 3)])) + def gc_content(self, as_decimal=True): + """Returns the GC content for the sequence. + Notes: + This method ignores N when calculating the length of the sequence. + It does not, however ignore other ambiguous bases. It also only + includes the ambiguous base S (G or C). In this sense the method is + conservative with its calculation. + + Args: + as_decimal (bool): Return the result as a decimal. Setting to False + will return as a percentage. i.e for the sequence GCAT it will + return 0.5 by default and 50.00 if set to False. + + Returns: + float: GC content calculated as the number of G, C, and S divided + by the number of (non-N) bases (length). + + """ + gc_total = 0.0 + num_bases = 0.0 + n_tuple = tuple('nN') + accepted_bases = tuple('cCgGsS') + + # counter sums all unique characters in sequence. Case insensitive. + for base, count in Counter(self.seq).items(): + + # dont count N in the number of bases + if base not in n_tuple: + num_bases += count + + if base in accepted_bases: # S is a G or C + gc_total += count + + gc_content = gc_total / num_bases + + if not as_decimal: # return as percentage + gc_content *= 100 + + return gc_content + + class Embl(Fasta): '''Exactly the same as Fasta, but reading seqs from a file works differently''' ===================================== pyfastaq/tests/sequences_test.py ===================================== --- a/pyfastaq/tests/sequences_test.py +++ b/pyfastaq/tests/sequences_test.py @@ -520,6 +520,19 @@ class TestFasta(unittest.TestCase): fa = sequences.Fasta('name', 'A') fa.split_capillary_id() + def test_gc_content(self): + """Test GC content calculation works as expected""" + tests = [ + (sequences.Fasta('ID', 'cgCG'), 1.0), + (sequences.Fasta('ID', 'tTaA'), 0.0), + (sequences.Fasta('ID', 'GCAT'), 0.5), + (sequences.Fasta('ID', 'GCATNN'), 0.5), + (sequences.Fasta('ID', 'GCATNNS'), 0.6), + (sequences.Fasta('ID', 'GCATNNSK'), 0.5) + ] + for test, answer in tests: + self.assertAlmostEqual(test.gc_content(), answer) + self.assertAlmostEqual(test.gc_content(as_decimal=False), answer * 100) class TestEmbl(unittest.TestCase): def test_get_id_from_header_line(self): ===================================== setup.py ===================================== --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ from setuptools import setup, find_packages setup( name='pyfastaq', - version='3.16.0', + version='3.17.0', description='Script to manipulate FASTA and FASTQ files, plus API for developers', packages = find_packages(), author='Martin Hunt', View it on GitLab: https://salsa.debian.org/med-team/fastaq/commit/e4dd7b7f6d4f01fc0b193e609338be560d7e3bb8 --- View it on GitLab: https://salsa.debian.org/med-team/fastaq/commit/e4dd7b7f6d4f01fc0b193e609338be560d7e3bb8 You're receiving this email because of your account on salsa.debian.org.
_______________________________________________ debian-med-commit mailing list debian-med-commit@lists.alioth.debian.org http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/debian-med-commit