1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
|
Remove duplicated sequences from an alignment
---------------------------------------------
The ``omit_duplicated`` app removes redundant sequences from a sequence collection (aligned or unaligned).
Let's create sample data with duplicated sequences.
.. jupyter-execute::
:raises:
from cogent3 import make_aligned_seqs
data = {
"a": "ACGT",
"b": "ACG-", # identical to 'a' except has a gap
"c": "ACGG", # duplicate
"d": "ACGG", # duplicate
"e": "AGTC", # unique
}
aln = make_aligned_seqs(data, moltype="dna")
aln
Creating the ``omit_duplicated`` app with the argument ``choose="longest"`` retains, from each set of duplicated sequences, the one with the fewest gaps and ambiguous characters. In the above example, only one of ``c`` and ``d`` will be retained.
.. jupyter-execute::
:raises:
from cogent3 import get_app
omit_shorter_duplicate = get_app("omit_duplicated", moltype="dna", choose="longest")
omit_shorter_duplicate(aln)
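Creating the ``omit_duplicated`` app with the argument ``choose=None`` means only unique sequences are retained: every member of a set of duplicated sequences is discarded, so in the above example neither ``c`` nor ``d`` is kept.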
.. jupyter-execute::
:raises:
from cogent3 import get_app
omit_all_duplicates = get_app("omit_duplicated", moltype="dna", choose=None)
omit_all_duplicates(aln)
The ``mask_degen`` argument specifies how to treat matches between sequences with degenerate characters.
Let's create sample data that has a DNA ambiguity code.
.. jupyter-execute::
:raises:
from cogent3 import make_aligned_seqs
aln = make_aligned_seqs(
{
"s1": "ATCG",
"s2": "ATYG", # matches s1 with ambiguity
"s3": "GGTA",
},
moltype="dna",
)
Since "Y" is the IUPAC ambiguity code for a pyrimidine, meaning the site can be either "C" or "T", ``s1`` matches ``s2`` when ``mask_degen=True`` and one of them will be removed.
.. jupyter-execute::
:raises:
from cogent3 import get_app
app_dna = get_app("omit_duplicated", mask_degen=True, choose="longest", moltype="dna")
app_dna(aln)
|