1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
|
-- reverse_string
CREATE OR REPLACE FUNCTION reverse_string(TEXT) RETURNS TEXT AS
'
DECLARE
reversed_string TEXT;
incoming ALIAS FOR $1;
BEGIN
reversed_string = '''';
FOR i IN REVERSE char_length(incoming)..1 loop
reversed_string = reversed_string || substring(incoming FROM i FOR 1);
END loop;
RETURN reversed_string;
END'
language plpgsql;
-- complements DNA
CREATE OR REPLACE FUNCTION complement_residues(text) RETURNS text AS
'SELECT (translate($1,
''acgtrymkswhbvdnxACGTRYMKSWHBVDNX'',
''tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX''))'
LANGUAGE 'sql';
-- revcomp
CREATE OR REPLACE FUNCTION reverse_complement(TEXT) RETURNS TEXT AS
'SELECT reverse_string(complement_residues($1))'
LANGUAGE 'sql';
-- DNA to AA
CREATE OR REPLACE FUNCTION translate_dna(TEXT,INT) RETURNS TEXT AS
'
DECLARE
dnaseq ALIAS FOR $1;
gcode ALIAS FOR $2;
translation TEXT;
dnaseqlen INT;
codon CHAR(3);
aa CHAR(1);
i INT;
BEGIN
translation = '''';
dnaseqlen = char_length(dnaseq);
i=1;
WHILE i+1 < dnaseqlen loop
codon = substring(dnaseq,i,3);
aa = translate_codon(codon,gcode);
translation = translation || aa;
i = i+3;
END loop;
RETURN translation;
END'
language plpgsql;
-- DNA to AA, default genetic code
CREATE OR REPLACE FUNCTION translate_dna(TEXT) RETURNS TEXT AS
'SELECT translate_dna($1,1)'
LANGUAGE 'sql';
CREATE OR REPLACE FUNCTION translate_codon(TEXT,INT) RETURNS CHAR AS
'SELECT aa FROM genetic_code.gencode_codon_aa WHERE codon=$1 AND gencode_id=$2'
LANGUAGE 'sql';
CREATE OR REPLACE FUNCTION concat_pair (text, text) RETURNS text AS
'SELECT $1 || $2'
LANGUAGE 'sql';
CREATE AGGREGATE concat (
sfunc = concat_pair,
basetype = text,
stype = text,
initcond = ''
);
--function to 'unshare' exons. It looks for exons that have the same fmin
--and fmax and belong to the same gene and only keeps one. The other,
--redundant exons are marked obsolete in the feature table. Nothing
--is done with those features' entries in the featureprop, feature_dbxref,
--feature_pub, or feature_cvterm tables. For the moment, I'm assuming
--that any annotations that they have when this script is run are
--identical to their non-obsoleted doppelgangers. If that's not the case,
--they could be merged via query.
--
--The bulk of this code was contributed by Robin Houston at
--GeneDB/Sanger Centre.
CREATE OR REPLACE FUNCTION share_exons () RETURNS void AS '
DECLARE
BEGIN
/* Generate a table of shared exons */
CREATE temporary TABLE shared_exons AS
SELECT gene.feature_id as gene_feature_id
, gene.uniquename as gene_uniquename
, transcript1.uniquename as transcript1
, exon1.feature_id as exon1_feature_id
, exon1.uniquename as exon1_uniquename
, transcript2.uniquename as transcript2
, exon2.feature_id as exon2_feature_id
, exon2.uniquename as exon2_uniquename
, exon1_loc.fmin /* = exon2_loc.fmin */
, exon1_loc.fmax /* = exon2_loc.fmax */
FROM feature gene
JOIN cvterm gene_type ON gene.type_id = gene_type.cvterm_id
JOIN cv gene_type_cv USING (cv_id)
JOIN feature_relationship gene_transcript1 ON gene.feature_id = gene_transcript1.object_id
JOIN feature transcript1 ON gene_transcript1.subject_id = transcript1.feature_id
JOIN cvterm transcript1_type ON transcript1.type_id = transcript1_type.cvterm_id
JOIN cv transcript1_type_cv ON transcript1_type.cv_id = transcript1_type_cv.cv_id
JOIN feature_relationship transcript1_exon1 ON transcript1_exon1.object_id = transcript1.feature_id
JOIN feature exon1 ON transcript1_exon1.subject_id = exon1.feature_id
JOIN cvterm exon1_type ON exon1.type_id = exon1_type.cvterm_id
JOIN cv exon1_type_cv ON exon1_type.cv_id = exon1_type_cv.cv_id
JOIN featureloc exon1_loc ON exon1_loc.feature_id = exon1.feature_id
JOIN feature_relationship gene_transcript2 ON gene.feature_id = gene_transcript2.object_id
JOIN feature transcript2 ON gene_transcript2.subject_id = transcript2.feature_id
JOIN cvterm transcript2_type ON transcript2.type_id = transcript2_type.cvterm_id
JOIN cv transcript2_type_cv ON transcript2_type.cv_id = transcript2_type_cv.cv_id
JOIN feature_relationship transcript2_exon2 ON transcript2_exon2.object_id = transcript2.feature_id
JOIN feature exon2 ON transcript2_exon2.subject_id = exon2.feature_id
JOIN cvterm exon2_type ON exon2.type_id = exon2_type.cvterm_id
JOIN cv exon2_type_cv ON exon2_type.cv_id = exon2_type_cv.cv_id
JOIN featureloc exon2_loc ON exon2_loc.feature_id = exon2.feature_id
WHERE gene_type_cv.name = ''sequence''
AND gene_type.name = ''gene''
AND transcript1_type_cv.name = ''sequence''
AND transcript1_type.name = ''mRNA''
AND transcript2_type_cv.name = ''sequence''
AND transcript2_type.name = ''mRNA''
AND exon1_type_cv.name = ''sequence''
AND exon1_type.name = ''exon''
AND exon2_type_cv.name = ''sequence''
AND exon2_type.name = ''exon''
AND exon1.feature_id < exon2.feature_id
AND exon1_loc.rank = 0
AND exon2_loc.rank = 0
AND exon1_loc.fmin = exon2_loc.fmin
AND exon1_loc.fmax = exon2_loc.fmax
;
/* Choose one of the shared exons to be the canonical representative.
We pick the one with the smallest feature_id.
*/
CREATE temporary TABLE canonical_exon_representatives AS
SELECT gene_feature_id, min(exon1_feature_id) AS canonical_feature_id, fmin
FROM shared_exons
GROUP BY gene_feature_id,fmin
;
CREATE temporary TABLE exon_replacements AS
SELECT DISTINCT shared_exons.exon2_feature_id AS actual_feature_id
, canonical_exon_representatives.canonical_feature_id
, canonical_exon_representatives.fmin
FROM shared_exons
JOIN canonical_exon_representatives USING (gene_feature_id)
WHERE shared_exons.exon2_feature_id <> canonical_exon_representatives.canonical_feature_id
AND shared_exons.fmin = canonical_exon_representatives.fmin
;
UPDATE feature_relationship
SET subject_id = (
SELECT canonical_feature_id
FROM exon_replacements
WHERE feature_relationship.subject_id = exon_replacements.actual_feature_id)
WHERE subject_id IN (
SELECT actual_feature_id FROM exon_replacements
);
UPDATE feature_relationship
SET object_id = (
SELECT canonical_feature_id
FROM exon_replacements
WHERE feature_relationship.subject_id = exon_replacements.actual_feature_id)
WHERE object_id IN (
SELECT actual_feature_id FROM exon_replacements
);
UPDATE feature
SET is_obsolete = true
WHERE feature_id IN (
SELECT actual_feature_id FROM exon_replacements
);
END;
' LANGUAGE 'plpgsql';
--This is a function to seek out exons of transcripts and orders them,
--using feature_relationship.rank, in "transcript order" numbering
--from 0, taking strand into account. It will not touch transcripts that
--already have their exons ordered (in case they have a non-obvious
--ordering due to trans splicing). It takes as an argument the
--feature.type_id of the parent transcript type (typically, mRNA, although
--non coding transcript types should work too).
CREATE OR REPLACE FUNCTION order_exons (integer) RETURNS void AS '
DECLARE
parent_type ALIAS FOR $1;
exon_id int;
part_of int;
exon_type int;
strand int;
arow RECORD;
order_by varchar;
rowcount int;
exon_count int;
ordered_exons int;
transcript_id int;
transcript_row feature%ROWTYPE;
BEGIN
SELECT INTO part_of cvterm_id FROM cvterm WHERE name=''part_of''
AND cv_id IN (SELECT cv_id FROM cv WHERE name=''relationship'');
--SELECT INTO exon_type cvterm_id FROM cvterm WHERE name=''exon''
-- AND cv_id IN (SELECT cv_id FROM cv WHERE name=''sequence'');
--RAISE NOTICE ''part_of %, exon %'',part_of,exon_type;
FOR transcript_row IN
SELECT * FROM feature WHERE type_id = parent_type
LOOP
transcript_id = transcript_row.feature_id;
SELECT INTO rowcount count(*) FROM feature_relationship
WHERE object_id = transcript_id
AND rank = 0;
--Dont modify this transcript if there are already numbered exons or
--if there is only one exon
IF rowcount = 1 THEN
--RAISE NOTICE ''skipping transcript %, row count %'',transcript_id,rowcount;
CONTINUE;
END IF;
--need to reverse the order if the strand is negative
SELECT INTO strand strand FROM featureloc WHERE feature_id=transcript_id;
IF strand > 0 THEN
order_by = ''fl.fmin'';
ELSE
order_by = ''fl.fmax desc'';
END IF;
exon_count = 0;
FOR arow IN EXECUTE
''SELECT fr.*, fl.fmin, fl.fmax
FROM feature_relationship fr, featureloc fl
WHERE fr.object_id = ''||transcript_id||''
AND fr.subject_id = fl.feature_id
AND fr.type_id = ''||part_of||''
ORDER BY ''||order_by
LOOP
--number the exons for a given transcript
UPDATE feature_relationship
SET rank = exon_count
WHERE feature_relationship_id = arow.feature_relationship_id;
exon_count = exon_count + 1;
END LOOP;
END LOOP;
END;
' LANGUAGE 'plpgsql';
|