1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
|
#!/usr/bin/env perl -w
# phylofoot.pl
# by Boris Lenhard
#
# See POD documentation for this script at the end of the file
#
use strict;
use Getopt::Long; # for parsing command line arguments
use Pod::Usage;
use TFBS::DB::FlatFileDir;
# DEFAULTS - presumes scripts are run from example/ directory
# in the TFBS distribution
# default values for optional parameters
my $CONSERVATION_PERCENT = 70;
my $MATRIX_THRESHOLD_PERCENT = 80;
my $SLIDING_WINDOW = 50;
# Get command line options - if you are curious how this
# works, check the Getopt::Long module documentation.
my ($database_dir, $alignment_file, $help);
my @matrix_IDs = ();
my $conservation = $CONSERVATION_PERCENT;
my $threshold = $MATRIX_THRESHOLD_PERCENT;
my $window = $SLIDING_WINDOW;
GetOptions('help' => \$help,
'alignment_file=s' => \$alignment_file,
'database=s' => \$database_dir,
'matrix-id:s' => \@matrix_IDs,
'conservation:f' => \$conservation,
'threshold-score:f' => \$threshold,
'window-size:i' => \$window
);
if($help) {
pod2usage(-exitstatus=>0, -verbose=>2);
}
elsif (!($alignment_file and $database_dir)) {
pod2usage(1);
}
# parse both the comma-separated lists and individually specified
# matrix IDs:
@matrix_IDs = split (",", join(',',@matrix_IDs));
# connect to FlatFileDir matrix database
# (there is a sample FlatFileDir matrix database directory
# examples/SAMPLE_FlatFileDir in the TFBS distribution package)
# Change this line if you want to use a different type of database
# (e.g. TFBS::DB::JASPAR2)
my $db = TFBS::DB::FlatFileDir->connect($database_dir);
# slurp the matrices into a TFBS::MatrixSet;
my $matrixset;
unless (scalar @matrix_IDs) { # if none are selected, we already have the set:
$matrixset = $db->get_MatrixSet(-matrixtype=>"PWM"); # retrieves all matrices from the db
}
else { # otherwise we retrieve inly the requested ones
$matrixset = $db->get_MatrixSet(-IDs => \@matrix_IDs,
-matrixtype=>"PWM");
}
# do the phylogenetic footprinting search of the sites
my $sitepairset = $matrixset->search_aln(-file=>$alignment_file,
-cutoff=>"$conservation",
-threshold=>"$threshold\%",
-windowsize => $window);
# print it out in GFF format
print $sitepairset->GFF;
# ANd that's it. Instead of simply printing GFF, you can e.g.
# - iterate through individual detected binding site pairs and extract
# the information you need
# - attach individual sittes as Bio::SeqFeature subclass objects
# to bioperl's Bio::Seq objects
# - collect the hit sequences and produce derived pattern matrices
# using TFBS::Pa
# - you tell me...
# The rest is usage message if the user requests help
# or fails to provide required parameters
__END__
=head1 NAME
phylofoot.pl - Phylogenetic footprinting example script
=head1 SYNOPSIS
./phylofoot.pl -a <alignment_file> -d <TFBS_matrix_dbase_dir> [other_options]
=head1 OPTIONS
=over 8
=item B<-d or --database> <directory name>
REQUIRED: Name of the FlatFileDir database directory to
use for retrieving matrices.
A sample database directory examples/SAMPLE_FlatFileDir
is available in TFBS distribution.
=item B<-a or --alignment-file> <alignment file name>
REQUIRED: Name of the pairwise alignment file in Clustal
format.
A sample database directory examples/sample_alignment.aln
is available in TFBS distribution.
=item B<-m or --matrix-id> <list of matrix IDs>
OPTIONAL: ID of the matrix from the database to scan the
alignment with.You can specify multiple matrices using
multiple -m switches or a single comma-separated lists of
IDs (NO spaces - e.g. -m M00001,M00021,N01921 ). You can
use a script called examples/list_matrices.pl in TFBS distribution
to list information for all matrices in a matrix database
of the FlatFileDir type.
DEFAULT: If no matrix IDs are specified, all matrices in
the database are used for the search;
=item B<-w or --window-size> <integer value>
OPTIONAL: The width of sliding window for calculating
the conservation profile of the submitted pairwise alignment.
DEFAULT: If not specified, the default value is 50 (nucleotides).
=item B<-c or --conservation> <percent value>
OPTIONAL: Conservation cutoff (in percent) for a region of
multiple alignment to include detected conserved sites into output.
DEFAULT: If not specified, the default value is 70 (%).
=item B<-t or --threshold-score> <percent value>
OPTIONAL: Threshold score (in percent) for a matrix match to
a subsequence.
DEFAULT: If not specified, the default value is 80 (%).
=back
=head1 DESCRIPTION
This is an example script that scans conserved regions of a
pairwise DNA sequence alignment with a set of matrices form
a flat file databases and produces GFF output. Its source code is
meant to be studied by bioinformaticians who wish to learn how to
use TFBS modules.
=cut
|