File: phylofoot.pl

package info (click to toggle)
libtfbs-perl 0.7.0%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 1,772 kB
  • ctags: 921
  • sloc: perl: 7,883; ansic: 699; makefile: 562; sh: 9
file content (189 lines) | stat: -rw-r--r-- 5,001 bytes parent folder | download | duplicates (8)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#!/usr/bin/env perl -w

# phylofoot.pl
#   by Boris Lenhard
#
# See POD documentation for this script at the end of the file
#

use strict;
use Getopt::Long; # for parsing command line arguments
use Pod::Usage;
use TFBS::DB::FlatFileDir;

  # DEFAULTS - presumes scripts are run from example/ directory
  # in the TFBS distribution

  # default values for optional parameters

my $CONSERVATION_PERCENT = 70;
my $MATRIX_THRESHOLD_PERCENT = 80;
my $SLIDING_WINDOW = 50; 


  # Get command line options - if you are curious how this 
  # works, check the Getopt::Long module documentation.

my ($database_dir, $alignment_file, $help);
my @matrix_IDs   = ();
my $conservation = $CONSERVATION_PERCENT;
my $threshold    = $MATRIX_THRESHOLD_PERCENT;
my $window       = $SLIDING_WINDOW;

GetOptions('help'            => \$help,
	   'alignment_file=s'  => \$alignment_file,
	   'database=s'      => \$database_dir,
	   'matrix-id:s'     => \@matrix_IDs,
	   'conservation:f'    => \$conservation,
	   'threshold-score:f' => \$threshold,
	   'window-size:i'     => \$window
	   );

if($help)  {
    pod2usage(-exitstatus=>0, -verbose=>2);
}
elsif (!($alignment_file and $database_dir)) {
    pod2usage(1);
}

  # parse both the comma-separated lists and individually specified
  # matrix IDs:

@matrix_IDs = split (",", join(',',@matrix_IDs));


  # connect to FlatFileDir matrix database
  # (there is a sample FlatFileDir matrix database directory 
  # examples/SAMPLE_FlatFileDir in the TFBS distribution package)
  # Change this line if you want to use a different type of database
  # (e.g. TFBS::DB::JASPAR2)


my $db = TFBS::DB::FlatFileDir->connect($database_dir);

  # slurp the matrices into a TFBS::MatrixSet;

my $matrixset;

unless (scalar @matrix_IDs) { # if none are selected, we already have the set:
  
    $matrixset = $db->get_MatrixSet(-matrixtype=>"PWM"); # retrieves all matrices from the db
}
else { # otherwise we retrieve inly the requested ones

    $matrixset = $db->get_MatrixSet(-IDs => \@matrix_IDs,
				    -matrixtype=>"PWM");
}

  
  # do the phylogenetic footprinting search of the sites

my $sitepairset = $matrixset->search_aln(-file=>$alignment_file,
					 -cutoff=>"$conservation",
					 -threshold=>"$threshold\%",
					 -windowsize => $window);

  # print it out in GFF format

print $sitepairset->GFF;

  # ANd that's it. Instead of simply printing GFF, you can e.g.
  #     - iterate through individual detected binding site pairs and extract
  #       the information you need
  #     - attach individual sittes as Bio::SeqFeature subclass objects
  #       to bioperl's Bio::Seq objects
  #     - collect the hit sequences and produce derived pattern matrices
  #       using TFBS::Pa
  #     - you tell me...










# The rest is usage message if the user requests help 
# or fails to provide required parameters


__END__


=head1 NAME

phylofoot.pl - Phylogenetic footprinting example script

=head1 SYNOPSIS

./phylofoot.pl -a <alignment_file> -d <TFBS_matrix_dbase_dir> [other_options] 

=head1 OPTIONS

=over 8

=item B<-d   or --database>  <directory name>

REQUIRED: Name of the FlatFileDir database directory to 
use for retrieving matrices. 
A sample database directory examples/SAMPLE_FlatFileDir 
is available in TFBS distribution. 

=item B<-a   or --alignment-file> <alignment file name>

REQUIRED: Name of the pairwise alignment file in Clustal 
format.
A sample database directory examples/sample_alignment.aln 
is available in TFBS distribution. 

=item B<-m   or  --matrix-id> <list of matrix IDs>

OPTIONAL: ID of the matrix from the database to scan the 
alignment with.You can specify multiple matrices using 
multiple -m switches or a single comma-separated lists of 
IDs (NO spaces - e.g. -m M00001,M00021,N01921 ). You can 
use a script called examples/list_matrices.pl in TFBS distribution
to list information for all matrices in a matrix database 
of the FlatFileDir type.

DEFAULT: If no matrix IDs are specified, all matrices in 
the database are used for the search;

=item B<-w   or  --window-size> <integer value>

OPTIONAL: The width of sliding window for calculating
the conservation profile of the submitted pairwise alignment.

DEFAULT: If not specified, the default value is 50 (nucleotides).

=item B<-c   or  --conservation> <percent value>

OPTIONAL: Conservation cutoff (in percent) for a region of
multiple alignment to include detected conserved sites into output.

DEFAULT: If not specified, the default value is 70 (%).

=item B<-t   or  --threshold-score> <percent value>

OPTIONAL: Threshold score (in percent) for a matrix match to 
a subsequence.

DEFAULT: If not specified, the default value is 80 (%).

=back



=head1 DESCRIPTION

This is an example script that scans conserved regions of a 
pairwise DNA sequence alignment with a set of matrices form 
a flat file databases and produces GFF output. Its source code is 
meant to be studied by bioinformaticians who wish to learn how to 
use TFBS modules.


=cut