File: align_plot_tree_min3.py

package info (click to toggle)
srst2 0.2.0-9
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 8,976 kB
  • sloc: python: 3,115; sh: 50; makefile: 29
file content (54 lines) | stat: -rwxr-xr-x 1,883 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54

import rpy2.robjects as robjects
r = robjects.r

from Bio.Align.Applications import MuscleCommandline
import os
import subprocess as sub

fasta_directory = "/home/UNIMELB/hdashnow/resistance_database/by_gene"

# Get the filenames of all files in the input directory "fasta_directory"
files = []
for f in os.listdir(fasta_directory):
        if not f.startswith("."):
            if f.endswith(".fsa") or f.endswith(".fasta"):
                files.append('"'+os.path.join(fasta_directory,f)+'"') # need to surround with " " due to () in filenames
print("Number of input files:", len(files))

# Run muscle on each fasta file to produce an alignment for each
# Save the alignment file names so that they can be used in R ape
for f in files:
        outfilename = (f+".aln").replace("(","").replace(")","")
        #print outfilename
        alignment = MuscleCommandline(input=f, out=outfilename)
        #alignment()


r('''
        require(ape, quietly=TRUE)

        all_files = list.files("/home/UNIMELB/hdashnow/resistance_database/by_gene",
                       pattern="aln$", full.names=TRUE)

        pdf(width=9,height=12,file="trees.pdf")

        for (filename in all_files) {
                #print(filename)
                aln = read.dna(filename, format="fasta")
                if (dim(aln)[1] >= 3) {
                    d = dist(aln)
                    #print(aln)
                    tree=nj(d)
                    plot(tree, cex=0.6)
                    #print(filename)
                    min_filename = tail(strsplit(filename,"/")[[1]],1) # This needs to be made robust
                    #print(min_filename)
                    gene_name = strsplit(min_filename,"\\\.")[[1]][1]
                    #print(cluster)
                    title(paste("Gene: ", gene_name))
                        #dev.off()
                }
        }
        dev.off()
''')