File: dmelhetfeatconv.xml

package info (click to toggle)
libchado-perl 1.31-6
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, sid, trixie
  • size: 44,716 kB
  • sloc: sql: 282,721; xml: 192,553; perl: 25,524; sh: 102; python: 73; makefile: 57
file content (356 lines) | stat: -rw-r--r-- 11,485 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
<opt
  name="dmelhetfeatconv"
  date="20040821"
  >
 
  <title>Chado DB Feature info</title>
  <about>
    Use this one for D.melanogaster heterochromatin genome db.
    
    These are configurations for converting chado feature table dumps to
    standard feature/sequence files. Much of what follows specifies how to
    process different features (tied to methods in ChadoFeatDump.pm).
    
    These configs should be data-set independent.
    This works with, but is independent of SeqUtil2 configs.
  </about>
  
  <!-- Input format (a chado feature table dump, per the about text) and the
       output formats; each outformats element names one format. -->
  <informat>feature_table</informat>  
  
  <outformats>fff</outformats>
  <outformats>gff</outformats>
  <outformats>fasta</outformats>

  <!-- copied from db-release config files .. this is mostly common info,
    but db-release config can override @featset, %featmap -->

  <!--  moved featset/featmap tags to separate config file -->

  <!-- NOTE(review): the OLD_USAGE_ prefix presumably retires this key so the
       consumer no longer finds rename_child_type (compare the jan06
       OLD_USAGE_ note further down); confirm before removing. -->
  <OLD_USAGE_rename_child_type>pseudogene|\w+RNA</OLD_USAGE_rename_child_type>
  <!-- mergematch is left empty in this config.
       NOTE(review): confirm the consumer treats empty as unset. -->
  <mergematch></mergematch>

  <!-- Flag set to 1; its effect on gff output is not visible in this file. -->
  <noforwards_gff>1</noforwards_gff>

  <!-- remapType: append the part of the feature name that matches this
       pattern to the type; works together with maptype below.
       NOTE(review): presumably this yields keys such as mRNA_genscan that
       maptype then maps; confirm in the consumer. -->
  <name2type_pattern>[-_](repeatmasker|genscan|piecegenie|twinscan|genewise|trnascan)</name2type_pattern>

  <!-- Don't warn when features of these types have a Parent oid that is not
       in the data, e.g. a chromosome parent. -->
  <maptype_ignore_missingparent>^(match_part|match|gene|cytology|chromosome_band|oligo|BAC|protein_binding_site)</maptype_ignore_missingparent>

  <!-- drop lengthy program.source from match name 
  match_sim4_na_EST_complete_dros RE37642.5prime-2L_wgs2cex-na_EST.complete.dros
  also fix these ugly names repeating feature types;
  match:sim4:na_STS.dros  BACN11E12-T7-211000022278175-na_STS.dros-sim4
    cuttype=1
    
    match_part_blastx_aa_SPTR_dros
    
  more cuttype names
  -211000022278591-aa_SP.real.dros-blastx
  match:blastx:aa_SPTR.dros       ID|AAL57609|SPTR|AAL57609-211000022278591-aa_SPTR.dros-blastx
  match:blastx:aa_TR.real.dros    Q967T2-211000022278591-aa_TR.real.dros-blastx
  match:blastx:aa_TR.real.dros    Q94885-211000022279519-aa_TR.real.dros-blastx
  match:blastx:aa_SPTR.dros       ID|AAL90081|SPTR|AAL90081-linked_7-aa_SPTR.dros-blastx  
  match:blastx:aa_SPTR.dros       ID|AAL48487|SPTR|AAL48487-AE003846R_extension-aa_SPTR.dros-blast
  match:groupest:na_DGC.dros      RH25653.3prime_revcomp-AABU01000160-na_DGC.dros-groupest
  
  
  perl -pi.old \
   -e's,\-AABU0\d+\S+dros\S+,,;' \
   -e's,\_extension\S+,,;' \
   -e's,\-linked_\S+,,;' \
   -e's,\-2110000\d+\S+,,;'
  
  -->
  
  <!-- Name clean-up rules. The first six are limited by type to ^match.*
       features; 1match carries cuttype="1" (see the cuttype notes in the
       comment above).
       NOTE(review): "from" is presumably a regex replaced by "to" in the
       feature name; confirm in the consumer. -->
  <mapname_pattern name="1match"
                   type="^match.*"
                   cuttype="1"
                   from="null"
                   to="null"/>
  <mapname_pattern name="2match"
                   type="^match.*"
                   from="\-?(2L|2R|3L|3R|4|X)[_\.].*$"
                   to=""/>
  <mapname_pattern name="3match"
                   type="^match.*"
                   from="\_extension\S+$"
                   to=""/>
  <mapname_pattern name="4match"
                   type="^match.*"
                   from="\-linked_\S+$"
                   to=""/>
  <mapname_pattern name="5match"
                   type="^match.*"
                   from="\-2110000\d+\S+$"
                   to=""/>
  <mapname_pattern name="6match"
                   type="^match.*"
                   from=";$"
                   to=""/>

  <!-- Rules without a type restriction, plus one for
       transposable_element_pred. -->
  <mapname_pattern name="cex"
                   from="_wgs3_centromere_extension"
                   to="_wgs2cex"/>
  <mapname_pattern name="dum"
                   from="\-dummy\-"
                   to=""/>
  <mapname_pattern name="tep"
                   type="transposable_element_pred"
                   from="JOSHTRANSPOSON\-"
                   to=""/>

<!--
  dmel r4 ugly match names: all the data appended  ! see above cuttype=1
match:tblastxwrap_masked:na_baylorf1_scfchunk.dpse 
Dpf1glom15-1-AE002603.4_22263978_22335911-na_baylorf1_scfchunk.d 
-->
  <!-- Type-name rewrite rules.
       NOTE(review): each rule presumably replaces the text matched by "from"
       with "to" in the feature type; confirm in the consumer. The last entry
       uses the literal value null for both from and to. -->
  <maptype_pattern name="1match" from=":rui.chen" to=""/>
  <maptype_pattern name="2match" from=":Brian.Bettencourt" to=""/>
  <maptype_pattern name="3masked" from="_masked" to=""/>
  <maptype_pattern name="null" typename="noname" from="null" to="null"/>


<!--
## SONG/so Revision: 1.45
##     @is_a@oligo ; SO:0000696 ; SOFA:SOFA ; synonym:oligonucleotide
## 'so' is no longer valid
##   old value: @is_a@so ; SO:1000000
##  options are limited: located_sequence_feature, SO:0000110 ??
##  in flybase, 'so' seems used for protein blast matches?
## segment not in this    
## alt choices ...
#      @is_a@assembly ; SO:0000353 ; SOFA:SOFA
# **    @is_a@golden_path ; SO:0000688 ; SOFA:SOFA   <<
# **    @is_a@supercontig ; SO:0000148 ; SOFA:SOFA ; synonym:scaffold    <<
#     @is_a@tiling_path ; SO:0000472 ; SOFA:SOFA
#     @is_a@virtual_sequence ; SO:0000499 ; SOFA:SOFA
#     @is_a@chromosome ; SO:0000340
#     @part_of@chromosome_arm ; SO:0000105

## aug04: add new analysis features (HDP,RNAiHDP,fgenesh,)
## these are like exons but parent feature lacks featureloc 
## - need to join together by object_oid/parent_oid and compute parent feature (has name)
## SO type.subtype should be match.program
## SONG: match, match_part match_set nucleotide_match cross_genome_match cDNA_match EST_match

#? use '.' instead of '_' for part type? would that throw gnomap/gbrowse usage? probably
-->

  <!-- Flat gene-model settings. The comment ahead of the feat_model elements
       below describes those as a rework of these.
       NOTE(review): confirm which of the two the consumer actually reads. -->
  <gene_is_complex>0</gene_is_complex>
  <GModelParents>mRNA</GModelParents>
  <GModelParts>protein,CDS,three_prime_UTR,five_prime_UTR</GModelParts>
  <CDS_spanType>protein</CDS_spanType>
  <CDS_exonType>exon</CDS_exonType>

  <!-- Gene-model definitions: a rework of the flat settings above. The
       original note also lists pseudogene and ncRNA handling. -->
  <feat_model id="mRNA">
    <parts>exon</parts>
    <parts>three_prime_UTR</parts>
    <parts>five_prime_UTR</parts>
    <!-- refers to the feat_model with id="protein" -->
    <submodels>protein</submodels>
  </feat_model>

  <feat_model id="protein"
              typelabel="CDS"
              parent="mRNA"
              hasspan="1"
              makepartsfrom="exon">
    <parts>CDS</parts>
  </feat_model>
 
  <!-- Types whose fmin FlyBase chado stores as 1-origin; other types are
       0-origin (reason unknown).
       NOTE(review): the NOTNOW prefix presumably keeps intron out of this
       set for now; confirm. -->
  <origin_one chromosome_arm="1"
              chromosome_band="1"
              chromosome="1"
              golden_path="1"
              NOTNOWintron="1"
              transposable_element_insertion_site="1"/>

  <!-- NOTE(review): types flagged for topsort; presumably emitted ahead of
       other features. Confirm against the consumer. -->
  <topsort chromosome_arm="1" chromosome="1" golden_path="1"/>
    
  <!-- Segment-type features. Per the skipaskid note further down, parent_oid
       is ignored for these types. -->
  <segmentfeats BAC="1"
                chromosome_arm="1"
                chromosome_band="1"
                chromosome="1"
                golden_path_region="1"
                golden_path_fragment="1"
                golden_path="1"
                databank_scaffold="1"
                scaffold="1"
                segment="1"
                source="1"
                syntenic_region="1"/>
  <!-- 
      ## segment no longer valid SO; supercontig or golden_path are best
  -->
  
  <!-- simplefeat is the segmentfeats set plus further types. -->
  <simplefeat BAC="1"
              chromosome_arm="1"
              chromosome_band="1"
              chromosome="1"
              golden_path_region="1"
              golden_path_fragment="1"
              golden_path="1"
              databank_scaffold="1"
              scaffold="1"
              oligo="1"
              oligonucleotide="1"
              point_mutation="1"
              region="1"
              repeat_region="1"
              segment="1"
              source="1"
              transcription_start_site="1"
              orthologous_region="1"
              syntenic_region="1"/>
    

  <!-- For skipaskid and segmentfeats types the parent_oid is ignored, so no
       attempt is made to build a dubious, possibly huge, compound feature. -->
  <skipaskid point_mutation="1"
             region="1"
             repeat_region="1"
             pinsertion="1"/>
    
  <!-- NOTE(review): types flagged under dropname; presumably their names are
       left out of the output. Confirm against the consumer. -->
  <dropname transcription_start_site="1"
            mRNA_genscan="1"
            match_genscan="1"
            mRNA_piecegenie="1"
            match_piecegenie="1"
            tRNA_trnascan="1"
            match_trnascan="1"
            repeat_region="1"
            match_blastn_na_dbEST_dpse="1"
            match_blastz="1"/>
    
    <!-- nameisid include ones with only id => name ; similar to dropid -->
  <nameisid 
    BAC="1"
    chromosome_band="1"
    oligo="1" oligonucleotide="1"
    />

  <!-- NOTE(review): types flagged under dropid; presumably their ids are
       left out of the output. Confirm against the consumer. -->
  <dropid BAC="1"
          cDNA_clone="1"
          chromosome_band="1"
          EST="1"
          exon="1"
          intron="1"
          five_prime_UTR="1"
          three_prime_UTR="1"
          oligo="1"
          oligonucleotide="1"
          processed_transcript="1"
          repeat_region="1"
          transcription_start_site="1"
          transposable_element_pred="1"/>
    
  <!-- NOTE(review): types flagged under dropfeat_fff; presumably omitted
       from fff output, confirm. For the OLD_USAGE_ prefixed keys see the
       jan06 OLD_USAGE_ note further down. -->
  <dropfeat_fff CDS_exon="1"
                CDS="1"
                exon="1"
                intron="1"
                remark="1"
                OLD_USAGE_five_prime_UTR="1"
                OLD_USAGE_three_prime_UTR="1"/>
    
  <!-- NOTE(review): types flagged under dropfeat_gff; presumably omitted
       from gff output, confirm. -->
  <dropfeat_gff CDS_exon="1" remark="1"/>

  <!-- NOTE(review): types flagged hasdups; presumably these can appear as
       duplicate records that the consumer has to collapse. Confirm. -->
  <hasdups three_prime_UTR="1"
           intron="1"
           five_prime_UTR="1"
           exon="1"
           match_repeat_runner_seg="1"
           match_repeat_runner="1"/>
    
  <!--   hasdups  match_repeat_runner="1"
    ? drop match_repeat_runner for match_repeat_runner_seg 
    match_repeat_runner="skip"

  jan06: OLD_USAGE_: 
  changed CDS/CDS_exon mapping to GFFv3 usage: CDS == coding_exon not mRNA-like span
  -->
    
    
  <!-- Type remapping table: each attribute name is an incoming type key and
       its value is the type to use instead. Several keys (pinsertion and a
       number of match_part_* entries among them) map to the value "skip".
       NOTE(review): "skip" presumably means the feature is dropped; confirm
       in the consumer. Keys prefixed OLD_USAGE_ go with the jan06 note above
       (CDS/CDS_exon mapping changed to GFFv3 usage). -->
  <maptype 
    OLD_USAGE_protein="CDS"
    OLD_USAGE_CDS="CDS_exon"
    five_prime_untranslated_region="five_prime_UTR"
    golden_path_region="scaffold"
    oligonucleotide="oligo"
    three_prime_untranslated_region="three_prime_UTR"
    transposable_element_pred="transposable_element_pred"
    
    mRNA_genewise="mRNA:genewise"
    mRNA_twinscan="mRNA:twinscan"
    mRNA_genscan="mRNA:genscan"
    mRNA_piecegenie="mRNA:piecegenie"

    gene_genewise="gene:genewise"
    gene_twinscan="gene:twinscan"
    gene_genscan="gene:genscan"
    gene_piecegenie="gene:piecegenie"
    
    mRNA_trnascan="tRNA:trnascan"
    gene_trnascan="tRNA:trnascan"
    match_tRNAscan_SE="tRNA:trnascan"

    match_clonelocator_scaffoldBACs="BAC"
    match_part_clonelocator_scaffoldBACs="skip"
    
    match_bdgp_unknown_clonelocator_scaffoldBACs="BAC"
    match_part_bdgp_unknown_clonelocator_scaffoldBACs="skip"

    match_promoter="transcription_start_site"
    
    match_locator_cytology="chromosome_band"
    match_part_locator_cytology="skip"

    match_aubrey_cytolocator_cytology="chromosome_band"
    match_part_aubrey_cytolocator_cytology="skip"
     
    match_JOSHTRANSPOSON_Sept="transposable_element_pred"
    match_repeatmasker="repeat_region"
    _repeatmasker="repeat_region"
    match_repeat_runner_seg="repeat_region"

    pinsertion="skip"
    match_sim4_na_affy_oligo_dros="oligo"
    match_dmel_r3_to_dmel_r4_migration_dmel_r3_affy_oligos="oligo"
    
    match_sim4_na_users_i_dros="skip"
    match_part_sim4_na_users_i_dros="skip"
    match_sim4_aa_users_i_dros="skip"
    match_part_sim4_aa_users_i_dros="skip"
    
    match_part_repeat_runner="skip"
    match_part_repeat_runner_seg="skip"
    match_part_promoter="skip"

    match_sim4_na_DGC_in_process_dros="match:sim4:na_DGC_dros"
    match_sim4_na_HDP_RNAi_dmel="match:RNAiHDP"
    match_sim4_na_HDP_mRNA_dmel="match:HDP"
    match_sim4_na_gadfly_dros_RELEASE2="match:sim4:na_gadfly_dmel_r2"
    match_sim4_na_transcript_dmel_RELEASE31="match:sim4:na_transcript_dmel_r31"
    match_sim4_na_transcript_dmel_RELEASE32="match:sim4:na_transcript_dmel_r32"
    match_tblastxwrap_na_baylorf1_scfchunk_dpse="match:tblastx:na_dpse"
    match_tblastxwrap_na_scf_chunk_agambiae_fa="match:tblastx:na_agambiae"

    match_part_sim4_na_DGC_in_process_dros="match_part:sim4:na_DGC_dros"
    match_part_sim4_na_HDP_RNAi_dmel="match_part:RNAiHDP"
    match_part_sim4_na_HDP_mRNA_dmel="match_part:HDP"
    match_part_sim4_na_gadfly_dros_RELEASE2="match_part:sim4:na_gadfly_dmel_r2"
    match_part_sim4_na_transcript_dmel_RELEASE31="match_part:sim4:na_transcript_dmel_r31"
    match_part_sim4_na_transcript_dmel_RELEASE32="match_part:sim4:na_transcript_dmel_r32"
    match_part_tblastxwrap_na_baylorf1_scfchunk_dpse="match_part:tblastx:na_dpse"
    match_part_tblastxwrap_na_scf_chunk_agambiae_fa="match_part:tblastx:na_agambiae"
    />
    
  <!--
  so => "located_sequence_feature", ## leave in for now; no replacement for so ; SO:1000000
  -->
    
  <!-- Type mapping keyed by child element name rather than by attribute.
       NOTE(review): presumably applied on top of maptype for gff output
       only; confirm in the consumer. -->
  <maptype_gff>
    <transposable_element_pred>transposable_element:predicted</transposable_element_pred>
    <tRNA_trnascan>tRNA:trnascan</tRNA_trnascan>
  </maptype_gff>

</opt>