File: expand_links.pl

package info (click to toggle)
fasta3 36.3.8h.2020-02-11-3
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 6,048 kB
  • sloc: ansic: 56,138; perl: 10,192; python: 2,205; sh: 416; csh: 85; sql: 55; makefile: 38
file content (101 lines) | stat: -rwxr-xr-x 3,275 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/perl

################################################################
# copyright (c) 2010, 2014 by William R. Pearson and The Rector &
# Visitors of the University of Virginia */
################################################################
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under this License is distributed on an "AS
# IS" BASIS, WITHOUT WRRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied.  See the License for the specific language
# governing permissions and limitations under the License. 
################################################################

## usage - expand_link.pl seed_acc.file > linked_fasta.file
##
## take a fasta36 -e expand.sh result file of the form:
## gi|12346|ref|NP_98765.1|<tab>1.1e-50
##
## and extract the accession number, looking it up from the an SQL
## table $table. This script uses the database created by link2sql.pl
## Code is included for linking to UniRef and as well as NCBI refseq
## searches.

## Once the linked accession numbers are found, the sequences are
## extracted from the SQL database seqdb_demo2 (see Mackey and Pearson
## (2004) Current Protocols in Bioinformatics (L. Stein, ed) "Using
## SQL databases for sequence similarity searching and analysis".
## Alternatively, one could use blastdbcmd or fastacmd to extract the
## sequences from an NCBI blast-formatted database.
##

use warnings;
use strict;
use DBI;

my ($host, $db, $port, $user, $pass, $table)  = ("xdb", "wrp_link", 0, "web_user", "fasta_www", "micr_samp_link50");

my $dbh = DBI->connect("dbi:mysql:host=$host:$db",
		       $user, $password,
                       { RaiseError => 1, AutoCommit => 1}
                      ) or die $DBI::errstr;

my %sth = ( 
    seed2link => "SELECT link_acc FROM $table WHERE seed_acc=?",
    link2seq => "SELECT * FROM seqdb_demo2.annot JOIN seqdb_demo2.protein USING(prot_id) WHERE acc=? AND pref=1"
    );

for my $sth (keys(%sth)) {
  $sth{$sth} = $dbh->prepare($sth{$sth});
}

my %acc_uniq = ();

while (my $line = <>) {
  chomp($line);
  my ($hit, $e_val) = split(/\t/,$line);
  processLine($hit,$sth{seed2link});
}

for my $acc ( keys %acc_uniq ) {

  $sth{link2seq}->execute($acc);
  while (my $row_href = $sth{link2seq}->fetchrow_hashref ) {
    print ">". $row_href->{db} . "|". $row_href->{acc} . " (micr_samp|$acc_uniq{$acc}) " .
      $row_href->{descr}. "\n";
    print $row_href->{seq} . "\n";
  }
  $sth{link2seq}->finish();
}

$dbh->disconnect();

sub processLine{
  my ($id,$sth)=@_;
  my ($link_acc);

  if ($id =~ m/^gi\|/) {
      # $id of the form: gi|12346|ref|NP_98765.1|<tab>1.1e-50
      $link_acc = (split(/\|/,$id))[3];
      $link_acc =~ s/\.\d+$//;
  }
  elsif ($id =~ m/^UniRef/) {
      $link_acc = $id;
      $link_acc =~ s/^UniRef\d+_//;
  }
  else {$link_acc = $id;}

  my $result = $sth->execute($link_acc);

  while (my ($acc) = $sth->fetchrow_array()) {
    next if ($acc eq $link_acc);
    $acc_uniq{$acc} = $link_acc unless $acc_uniq{$acc};
  }
  $sth->finish();
}