File: chadofeatsql.xml

package info (click to toggle)
libchado-perl 1.31-6
links: PTS, VCS
area: main
in suites: bookworm, bullseye, sid, trixie
size: 44,716 kB
sloc: sql: 282,721; xml: 192,553; perl: 25,524; sh: 102; python: 73; makefile: 57
file content (1209 lines) | stat: -rw-r--r-- 41,665 bytes
parent folder | download | duplicates (5)
<opt
  name="chadofeatsql" 
  date="20071014"
  >

  <title>Chado DB SQL</title>
  
  <about id="chadofeatsql"><![CDATA[

    This is a collection of Chado DB SQL calls to extract
    all basic genome features, into intermediate feature_table form
    
  arm     fmin    fmax    strand  type    name    id      oid     attr_type       attribute
  2L      0       305900  1       golden_path_region      AE003590        AE003590        1273141 dbxref   Gadfly:AE003590
  2L      6364    6366    1       transcription_start_site        6365-6366-AE999999.Fake-dummy-promoter   6365-6366-AE999999.Fake-dummy-promoter  1273564         
  2L      6773    6808    1       exon            NULL:960558     1273721 parent_oid      1273720:1
  2L      6773    9276            mRNA    6773,9276-AE999999.Fake-dummy-piecegenie        NULL:9605
  
    There is a tag feature_sql, type="feature_table" for each group of
    features needing separate SQL: chromosome/super-contigs,
    gene_models, matches, analyses, syntenic features ..
    
    There are three critical ENV_default configurations defined here, 
    that can be overridden in your site_defaults.xml or main bulkfiles.xml 
    configuration:
    
       golden_path="'chromosome','chromosome_arm','ultra_scaffold'"
         .. the top level source feature(s) for your genome data

       seq_ontology="sequence"
         .. the name in Chado cv table that refers to cvterm table sequence terms
         .. "Sequence Ontology Feature Annotation",
             SOFA, SO, Sequence Ontology, sequence, and other values have been used.
         
       species="Unknown_species"

    Find worked examples from this configuration file now at
    http://www.gmod.org/Sample_Chado_SQL
       
    Logic in Bulkfiles::FeatureWriter then merges, selects/reformats
    these feature_tables and writes bulk files per chromosome.  One can
    also attach post-processing scripts (see matches below).
    
    Written originally for the FlyBase  project, it attempts to work
    for a range of Chado databases.
  ]]></about>




  <ENV_default
    seq_ontology="sequence"
    seq_ontology_old="Sequence Ontology Feature Annotation"
    golden_path="'chromosome', 'chromosome_arm', 'ultra_scaffold', 'supercontig'"
    species="Unknown_species"
    org="unk"
    featureprops="'Note','cyto_range','gbunit'"
    unknown_chr="U"
    analysis_where_clause=""
    est_where_clause=""
    release_id="0"
    release_date="0"
    GFF_source="MOD" 
    taxon="0"
  />
  <about id="ENV_default"><![CDATA[

  Variables above of form '$ {xxx}' for XML::Simple are handy, 
  however things fail if (a) they are not defined by caller
  or (b) defined badly. Need default config vars (XML::Simple doesn't do )
  
  seq_ontology is name in CV table (SO, Sequence Ontology, etc.)
  golden_path value is set of quoted names for chromosome/.. type
  'chromosome', 'chromosome_arm', 'ultra_scaffold' ..
  feature.type.name IN ($ {golden_path})
  
  2007oct
    : change default seq_ontology="sequence" to match change in GMOD onto. loading
    : add 'supercontig', synonym of scaffold, to default golden_path
           
    : example clauses on how to segregate analysis types        
    analysis_where_clause="AND NOT (an.program = 'sim4' AND an.sourcename IN ('na_dbEST'))"
    est_where_clause="AND (an.program = 'sim4' AND an.sourcename IN ('na_dbEST'))"

  ]]></about>



  <feature_sql id="organism_summary" type="summary" output="organisms-overview.txt">
  <sql><![CDATA[
    SELECT o.organism_id,o.abbreviation,o.genus,o.species,o.common_name, 
           count(f.feature_id) as n_features, o.comment
    FROM organism o LEFT JOIN feature f USING (organism_id)
    GROUP by o.organism_id,o.abbreviation,o.genus,o.species,o.common_name,o.comment
    ORDER BY o.genus,o.species
    ;
  ]]></sql>
  
  <!-- // drop, header fixed from above
  <script name="orgsum" type="postprocess" shell="perl -ni" language="perl">
  <![CDATA[
  unless($header++){ print join("\t",
   qw(Organism_id Abbreviation Genus Species Common_name N_features Comment)),"\n"; } 
  @v=split"\t"; foreach (@v) { $_="\\N" unless(/\S/); } chomp($v[-1]);
  print join("\t",@v),"\n";
  ]]></script>
  -->
  
  </feature_sql>


  <feature_sql id="analysis_summary" type="summary" output="analyses-overview.txt">
  <sql><![CDATA[
  SELECT 
    an.analysis_id as Analysis_id,
    CASE WHEN (an.sourcename IS NULL OR an.sourcename = 'dummy') THEN 'match:' || an.program
      ELSE 'match:' || an.program || ':' || an.sourcename
    END AS Analysis_type, 
    count(f.feature_id) as N_features,
    ROUND( (AVG(af.rawscore)::numeric), 2 ) as Ave_score,
    ROUND( (AVG(af.significance)::numeric),  2 ) as Ave_sig,
    (select genus || '_' || species from organism where organism_id = f.organism_id) as Species
  FROM feature f, analysisfeature af, analysis an
  WHERE an.analysis_id = af.analysis_id and af.feature_id = f.feature_id
  GROUP BY f.organism_id, an.analysis_id, Analysis_type  
  ORDER BY species, Analysis_type
  ;
  ]]></sql>  

  </feature_sql>

  <feature_sql id="property_summary" type="summary" output="properties-overview.txt">
  <sql><![CDATA[
  SELECT 
    fp.type_id,
    (select name from cvterm where cvterm_id = fp.type_id) as Property_type,
    count(fp.featureprop_id) as N_properties,
    count(distinct f.feature_id) as N_features,
    count(distinct fp.value) as N_values,
    (select genus || '_' || species from organism where organism_id = f.organism_id) as Species
  FROM feature f join featureprop fp on (fp.feature_id = f.feature_id)
  GROUP BY f.organism_id, fp.type_id
  ORDER BY Species, Property_type
  ;
  ]]></sql>  

  </feature_sql>


  <feature_sql id="feature_summary" type="summary" output="features-overview.txt">
  <sql><![CDATA[
  SELECT 
    f.type_id,
    (select name from cvterm where cvterm_id = f.type_id) as Feature_type,
    count(f.feature_id) as N_features,
    sum(length(f.residues)) as N_residues,
    sum(f.seqlen) as Tot_len,
    ROUND( AVG(f.seqlen), 0 ) as Ave_len,
    MIN(f.seqlen) as Min_len,
    MAX(f.seqlen) as Max_len,
    (select genus || '_' || species from organism where organism_id = f.organism_id) as Species
  FROM feature f  
  GROUP BY f.organism_id, f.type_id 
  ORDER BY Species, Feature_type
  ;
  ]]></sql>  

  </feature_sql>


  <feature_sql id="chromosome_summary" type="NOTYET_summary" output="chromosomes-overview.txt">
  <about>
  This chromosome_summary needs more testing: 
  works well enough on some dbs, but hung up on a largish one.
  Moved cvterm.name from where clause to subselect after reducing features.. 5x cut in grouping cost 
  </about>
  <sql>
    SELECT  
      f.type_id as Type_id,  
      (select name from cvterm where cvterm_id = f.type_id) as Golden_path_type,
      count(DISTINCT f.feature_id) as N_features, 
      sum(f.seqlen) as Tot_len, 
      sum(length(f.residues)) as N_residues,
      (select genus || '_' || species from organism where organism_id = f.organism_id) as Species
    FROM  feature f join featureloc fl on (fl.srcfeature_id = f.feature_id)
    GROUP BY  f.organism_id, f.type_id
    ORDER BY Species, Golden_path_type
    ;
  </sql>
  </feature_sql>
 
 
  <!-- // want something like this to show tables with >0 rows ...
  
  try this for row counts == reltuples:
  select relname,reltype,relpages,reltuples,relkind from pg_class 
    where relkind = 'r' and relnamespace = 2200 order by reltuples DESC ;

  <feature_sql id="db_summary" type="summary" output="database-overview.txt">
  <sql><![CDATA[
  SELECT s.tablename, count(s.tablename.*) as row_count
  from pg_tables s
  where s.schemaname = 'public'
  ;
  ]]></sql>  
  -->


  <!-- 0710: add SO/SOFA CV table identity check -->
  
  <feature_sql id="0_seq_ontology_check" type="validate" >
  <warning>
Failed to validate configuration variable 
    seq_ontology='${seq_ontology}'
in database tables cv, cvterm and feature. Please check your configuration.
  </warning>
  
  <sql><![CDATA[
    SELECT feat.feature_id
    FROM   feature feat, cvterm featcv, cv socv
    WHERE  socv.name = '${seq_ontology}'
       and featcv.cv_id = socv.cv_id 
       and featcv.cvterm_id = feat.type_id
    LIMIT 1;  
  ]]></sql>
  
  <sql_inspect><![CDATA[
    SELECT name as seq_ontology_name, cv_id FROM cv 
    WHERE cv_id IN (SELECT cv_id FROM cvterm WHERE name = 'exon');
  ]]></sql_inspect>
  </feature_sql>

  <feature_sql id="1_golden_path_check" type="validate">
  <warning>
Failed to validate configuration variable 
    golden_path='${golden_path}'
in database tables cv, cvterm and feature. Please check your configuration.
  </warning>

  <sql><![CDATA[
    SELECT feat.feature_id -- ? or uniquename
    FROM   feature feat, cvterm featcv, cv socv
    WHERE  featcv.name in ( ${golden_path} )    
       and featcv.cv_id = socv.cv_id 
       and socv.name = '${seq_ontology}'
       and featcv.cvterm_id = feat.type_id
    LIMIT 1; 
  ]]></sql>

  <sql_inspect><![CDATA[
    SELECT
      f.type_id,  
      (select name from cvterm where cvterm_id = f.type_id) as golden_path_type,
      count(DISTINCT f.feature_id) as N_features, 
      sum(f.seqlen) as Tot_len, sum(length(f.residues)) as N_residues
    FROM  feature f join featureloc fl on (fl.srcfeature_id = f.feature_id)
    GROUP BY f.type_id
    ;
  ]]></sql_inspect>
  </feature_sql>



  <feature_sql id="go_association" type="list" 
               output="gene_association.${GFF_source}_${org}.txt">
  <about>
  0710: preliminary to produce gene ontology association tables
  .. need variables for GO CV tables // ok: see awkward sql below
  .. need taxon code in organism.table and/or main config
  .. want filename with ${GFF_source}_${org} suffix ?
  .. add header for GO submission ..
  </about>

  <script name="goa_header" type="postprocess" shell="perl -pi" language="perl">
  <![CDATA[
unless($header++){ print "
!Version:  ${release_id}
!Organism: ${species}
!date: ${release_date}
!From: ${GFF_source}
";}
  ]]></script>

  <sql><![CDATA[
 SELECT
    '${GFF_source}' as DB,
    f.uniquename as ID, 
    f.name as Symbol, -- from synonyms table instead?
    '' as Qualifier,
    'GO:'||(select accession from dbxref where dbxref_id = c.dbxref_id) as GO_ID,
      -- sometimes chado dbxref has 'GO:' prefix, sometimes not. GO spec says 'GO:' is part
      -- of accession and *should* be stored as dbxref.accession
    '${GFF_source}:'||pubx.uniquename as Reference,
    'IEA' as EV_code, -- fixme, where in chado? 
    '' as With, 
    CASE   WHEN c.cv_id = (select cv_id from cv where name ilike '%biological%process%')
           THEN 'P'
           WHEN c.cv_id = (select cv_id from cv where name ilike '%molecular%function%')
           THEN 'F'
           WHEN c.cv_id = (select cv_id from cv where name ilike '%cellular%component%')
           THEN 'C'
           ELSE 'X'
    END as Aspect,
    '' as Name, -- fixme
    '' as Synonyms, -- fixme
    (select name from cvterm where cvterm_id = f.type_id) as Type,
    'taxon:${taxon}' as Taxon,  
    replace(substr(text(f.timelastmodified),1,10),'-','') as Date,  
    '${GFF_source}' as Assigned
    
  FROM cv socv, feature_cvterm fc 
    join cvterm c on (c.cvterm_id = fc.cvterm_id)
    join feature f on (f.feature_id = fc.feature_id)
    join pub pubx on (pubx.pub_id = fc.pub_id)  
  WHERE 
  (    c.cv_id = (select cv_id from cv where name ilike '%biological%process%')
    or c.cv_id = (select cv_id from cv where name ilike '%molecular%function%')
    or c.cv_id = (select cv_id from cv where name ilike '%cellular%component%')
  ) and  f.type_id = (select cvterm_id from cvterm where name = 'gene' 
          and cv_id = socv.cv_id and socv.name = '${seq_ontology}')
    -- limit it to gene features
  ;
  ]]></sql>
  </feature_sql>


  <feature_sql id="fbids" type="SKIP" output="fbid_table1.txt">
  <sql><![CDATA[
    -- used for flybase to exclude cytologic features with same genome feature id
    SELECT accession FROM dbxref 
    WHERE accession like '%FBgn%' or accession like '%FBti%'
    ;
  ]]></sql>
  </feature_sql>

  <feature_sql id="fbid_table" type="SKIP" output="fbid_table.txt">
  <sql>;</sql>
  
  <script name="fbidtab" input="-" type="postprocess"  language="perl"
    shell="cat fff/*.fff |  egrep '\b(gene|transposable_element)\b' | grep '\bFB' | perl " notshell="perl ">
<![CDATA[
# fbidtab.pl
# use: cat fff/*.fff | egrep '\b(gene|transposable_element)\b' | grep '\bFB' | perl -n fbidtab.pl | sort > fbid_table.txt
  BEGIN{
  print qq{#   Database ID table for primary gene model features
#  gene,ncRNA,pseudogene,rRNA,snRNA,snoRNA,tRNA,transposable_element,
#  transposable_element_insertion_site
# ANID = Annotation ID ;  FBID = primary FlyBase ID ; SYM = gene symbol ; ID2 = secondary IDs
}, join"\t",qw(ANID FBID SYM ID2),"\n";
  }
  ($a,$b,$ft,$sm,$cm,$r,$id,$dx,$nt)=split"\t"; 
  @gnid=(); @gn2=();
  push(@gnid, ($dx=~/(FBgn\d+)/g)); 
  push(@gnid, ($dx=~/(FBti\d+)/g)); 
  unshift(@gnid,$id) if (!@gnid && $id =~ /FB\w\w\d+/);
  $gnid= shift @gnid;
  ($anid2)=($dx=~/(FBan\d+)/); 
  push(@gnid,$anid2) if($anid2);
  push(@gn2, ($nt=~/synonym_2nd=(FB\w\w\d+)/g));  #?? getting prime id ?
  push(@gn2, ($nt=~/synonym_2nd=(C[GR]\d+)/g));   
  push(@gn2, ($nt=~/synonym_2nd=(G\w\d+)/g));   #dpse, others?
  foreach $gn2 (@gn2) { push(@gnid,$gn2) unless(grep /$gn2/,@gnid); }
  $gnid2= join(",",@gnid);
  print join("\t",$id,$gnid,$sm,$gnid2),"\n";
]]></script>
  </feature_sql>


  <feature_sql id="ortho_table" type="list" output="ortho_table.txt">
  <sql>;</sql>

  <script name="orthotab" input="-" type="postprocess"  language="perl"
    shell="cat tmp/featdump/features.tsv tmp/featdump/synteny.tsv | egrep 'putative_ortholog_of|orthologous_region' | perl " notshell="perl ">
<![CDATA[
# orthotab3.pl
# egrep -h 'putative_ortholog_of|orthologous_region' $dp/tmp/featdump/features.tsv $dp/tmp/featdump/synteny.tsv | perl -n orthotab.pl
BEGIN { 
 %po=(); %sppmap= ('1' => 'Dmel', '3' => 'Dpse' ); 
 print qq{#
#  Ortholog table for Drosophila species 
# OrthoID = orthology ID and protein annotation ID for each orthology pair
}, join"\t",qw(OrthoID DpseID DmelID Dmel_Symbol),"\n";
}
chomp; 
($a,$b,$e,$s,$org,$t,$n,$id,$oid,$atype,$att)=split"\t";
if ($atype eq 'putative_ortholog_of') {
  $att =~ s/FlyBase://;
  $po{$att}= $id;
} else {
if($atype eq 'to_species') { $to_species= $att; }
if($atype eq 'to_name') { $to_name= $att; }
if($l_oid == $oid && $org == 3) { # paired to_ attr per oid; !to_dpse lack to_name !???
  $spp = $sppmap{$org};
  $spp2= $sppmap{$to_species};
  $ga= ($to_name =~ /(C[GR]\d+)/) ? $po{$1} : '';
  @to_name= reverse split(/,/,$to_name); # name,ID -> ID,name
  $id =~ s/ortho://; 
  print join("\t",$id,$ga,@to_name),"\n";
  #print join("\t",$id,"$spp:$ga","$spp2:$to_name"),"\n";
}
$l_oid= $oid; 
}
]]></script>
  </feature_sql>


  <feature_sql id="chromosomes_dpse" type="feature_table" output="chromosomes.tsv">

  <sql><![CDATA[
-- get chromosome/arm/supercontig/ ; mess needed for dpse with mixture of
-- assembly-ness
CREATE TEMP TABLE chromosomes AS
SELECT 
  armft.uniquename as arm,
  1 as fmin, armft.seqlen as fmax, 0 as strand,
  armft.organism_id,
  featcv.name as type,
  armft.name as name,
  armft.uniquename as id,
  armft.feature_id as foid,
  'species' as attr_type, 
  org.genus || '_' || org.species AS attribute
FROM 
  feature armft, organism org, cvterm featcv, cv socv
WHERE 
  featcv.name in ( 'chromosome_arm', 'ultra_scaffold' )      
  and featcv.cv_id = socv.cv_id and socv.name = '${seq_ontology}'
  and armft.organism_id = org.organism_id
  and armft.type_id = featcv.cvterm_id
  and armft.seqlen IS NOT NULL  -- ? want this
  and NOT (armft.uniquename like 'Unknown%') -- these should not be ultrascafs
;

INSERT INTO chromosomes 
SELECT 
  armft.uniquename as arm,
  1 as fmin, fctg.seqlen as fmax, fr.rank as strand,  
  armft.organism_id,
  CASE WHEN (armft.uniquename = 'U') -- fctg.uniquename like 'Unknown%'
    THEN 'golden_path_region' 
    ELSE ctgcv.name
    END AS type,
  fctg.name as name,fctg.uniquename as id,
  fctg.feature_id as foid,
  'species' as attr_type, 
  org.genus || '_' || org.species AS attribute
FROM 
  feature armft, feature fctg, feature_relationship fr,
  cvterm featcv, cvterm ctgcv, cv socv, organism org
WHERE 
  (
   (featcv.name = 'chromosome_arm' AND armft.uniquename = 'U')
    --^ this needed for U-Unknowns
    -- dont want other arms here with XR_group, 4_group parts 
  OR
   ( featcv.name = 'ultra_scaffold' ) 
    -- ^ this gets groupMISC parts, with rank order
  )
  and featcv.cv_id = socv.cv_id and socv.name = '${seq_ontology}'
  and fr.object_id = armft.feature_id
  and fr.subject_id = fctg.feature_id
  and armft.type_id = featcv.cvterm_id  
  and armft.seqlen IS NULL 
  and armft.organism_id = org.organism_id
  and fctg.type_id = ctgcv.cvterm_id  
;

SELECT * from chromosomes
ORDER BY organism_id, arm, strand, id
;

-- DROP TABLE chromosomes;

]]>
  </sql>
  
  <script name="remapUandMISC" type="postprocess" shell="perl -i.old" language="perl">
<![CDATA[

BEGIN { %armpart=(); $spp= ${species} || 'Drosophila_pseudoobscura'; }  
while(<>) { 
  if (/^arm\t/) { print $_; next; } #header
  elsif (/\(\d+ rows\)/) { next; } #tail
  my @v= split "\t"; # [0] == arm; [-1] == species; [5]== type; [7] == uniquename
  
  if ($v[-1] !~ /$spp/) { print join("\t",@v); }
  else {  
  if ($v[5] =~ /^golden_path_region$/) { 
    $arm = $v[0]; ## $v[0] is correct arm name now, not Unknown...
    $rank= $v[3]; # == strand field
    if ($rank) {
      $rank= sprintf("%08d",$rank);  
      } 
    elsif ($v[7] =~ /^(Unknown\D*)(\d*)$/) {
      my ($t,$n)= ($1,$2); $n= 0 unless($n);
      $rank= sprintf("$t%08d",$n);  # make sortable
      }
    else {
      $rank= 'zzz';
      }
    $v[0] = $rank; # for sort
    $armpart{$arm}= [] unless $armpart{$arm};
    push( @{$armpart{$arm}}, \@v);
    }
  else { 
    if ($v[0] =~ /group/) { $v[5] = 'golden_path';  } # was ultra_scaffold
    else { $v[5] = 'chromosome_arm'; } # was ultra_scaffold
    print join("\t",@v);
    }
  }  
   
} 
END { mapparts(); }

sub mapparts {
  foreach my $arm (sort keys %armpart) {
  @r= @{$armpart{$arm}};
  if (@r) { # need hash per chr now !
    open(ARGVOUT, ">>$ARGV"); select(ARGVOUT); # perl closes before this
    @r= sort{ $$a[0] cmp $$b[0] } @r; # sort by rank
    my ($totlen,$orgid,$rank, $spp)= (0,0,0,'unknown');
    foreach my $r (@r) {
      $$r[0] = $arm;  ## $$r[0] =  $Uchr; 
      $$r[1] += $totlen;
      $$r[2] += $totlen;
      $rank++; $$r[3]= $rank unless($$r[3]);
      $orgid = $$r[4];
      $spp   = $$r[-1];
      $$r[5] = 'golden_path_fragment'; # need new type for part of unknown...
      $totlen= $$r[2];
      }
    chomp($spp);
    print join("\t", $arm, 1, $totlen, 0, $orgid, "golden_path", 0, "species",$spp), "\n";
    foreach my $r (@r) { print join("\t",@$r); } ## put fragments at end 
    }
  @r=(); 
  }
} 

]]>
  </script>
  
  </feature_sql>
  
  
  
  
  <feature_sql id="chromosomes" type="feature_table" output="chromosomes.tsv">
  <sql><![CDATA[
-- get chromosome/arm/supercontig/... lengths/names
-- need ENV/other subsitution Variable for 'chromosome', 'chromosome_arm', 'ultra_scaffold'
-- add get-residues to file 
-- made chr == uniqname nov04; new dmel4r uses name == genbank accession
SELECT 
  armft.uniquename as arm,
  '1' as fmin, armft.seqlen as fmax, '0' as strand,
  armft.organism_id,
  featcv.name as type,
  armft.name as name,
  armft.uniquename as id,
  armft.feature_id as oid,
  'species' as attr_type, 
  org.genus || '_' || org.species AS attribute
FROM 
  feature armft, organism org, cvterm featcv, cv socv
WHERE 
  featcv.name in ( ${golden_path} )    
  and featcv.cv_id = socv.cv_id and socv.name = '${seq_ontology}'
  and armft.organism_id = org.organism_id
  and armft.type_id = featcv.cvterm_id
ORDER BY arm ;
]]></sql>
    
  <script name="remapunk" type="postprocess" shell="perl -i.old" language="perl">
<![CDATA[
# remap dpse Unknown ultra_scaffold singletons and groups to one bag U chromosome
# need to impose artificial ordering; rewrite start/stop locations after sort by name
# species set in release.xml as ENV variable ; Drosophila_pseudoobscura
# mar05: need to handle dpse 4MISC link Unknown; is bag chr with ordered Contig_contigs
# but db lacks featurelocs assigning these to 4MISC locations.  Need to use featrel rank
## for ordering : rename Contig_contig to 4MISC:1, 4MISC:2 ... ?

BEGIN { 
  @r=(); $Uchr= "${unknown_chr}" || "U"; 
  @gpaths= (${golden_path}); # may be several, should be quoted
  $gpath= shift @gpaths || "chromosome";  $gpath =~ s/['"]//g;
  }  
while(<>) { 
  if (/^arm\t/) { print $_; next; } #header
  my @v= split "\t"; # [0] == arm; [-1] == species
  if ($v[-1] =~ /${species}/ && $v[0] =~ /^(Unknown\D*)(\d*)$/) { 
    my ($t,$n)= ($1,$2); $n= 0 unless($n);
    $v[0]= sprintf("$t%08d",$n);  # make sortable
    push(@r, \@v); 
    }
  else { 
    if ($v[-1] =~ /${species}/ && $v[0] =~ /group/) {
      $v[5] = 'golden_path'; 
      }
    elsif ($v[-1] =~ /${species}/) {
      $v[5] = $gpath; 
      }
    print join("\t",@v);
    }
 } 
END { mapunk(); }

sub mapunk {
  if (@r) {
    open(ARGVOUT, ">>$ARGV"); select(ARGVOUT); # perl closes before this
    @r= sort{ $$a[0] cmp $$b[0] } @r; # sort by arm name
    my ($cmin,$orgid, $spp)= (0,0,'unknown');
    foreach my $r (@r) {
      $$r[0] =  $Uchr; # or "U" ?
      $$r[1] += $cmin;
      $$r[2] += $cmin;
      $orgid = $$r[4];
      $spp   = $$r[-1];
      $$r[5] = 'golden_path_fragment'; # need new type for part of unknown...
      $cmin  = $$r[2];
      }
    print join("\t", $Uchr, 1, $cmin, 0, $orgid, "golden_path", 0, "species",$spp), "\n";
    foreach my $r (@r) { print join("\t",@$r); } ## put fragments at end 
    }
  @r=(); 
} 

]]>
  </script>

  </feature_sql>

  
  <feature_sql id="features" type="feature_table" output="features.tsv">
  <sql><![CDATA[
-- standard features (excluding matches, analyses)
SELECT DISTINCT
  armft.uniquename as arm,
  armloc.fmin, armloc.fmax, armloc.strand,
  armft.organism_id,
  targcv.name as type,
  targft.name as name,
  targft.uniquename as id,
  targft.feature_id as oid,
  attr.type as attr_type, attr.attribute

FROM
  feature armft, 
  feature targft left outer join gffattr_gmodel attr
    on (targft.feature_id = attr.feature_id),
  featureloc armloc, 
  cvterm targcv

WHERE
  NOT (targcv.name in ('match', 'syntenic_region', 'orthologous_region')) 
  and targft.type_id = targcv.cvterm_id 
  and armft.feature_id = armloc.srcfeature_id  
  and targft.feature_id = armloc.feature_id  
  
-- dont need ORDER clause? but see below
-- 0710: memory error here w/ one chado: parameciumdb, including simple psql -f sql call
--       drop this order-by and do with perl?
-- ORDER BY arm, armloc.fmin  
;

  ]]></sql>
  </feature_sql>
  

  <feature_sql id="synteny" type="feature_table"  output="synteny.tsv">

  <!-- 
  add this to make tables of synt/org  for  gbrowse maps

  egrep 'to_species.1' synteny.tsv | perl -ne \
'($c,$b,$e,$t,$og,$ft,$nm,$id,$xx)=split"\t";print"$id\t$c\t$b\t$e\n";' \
> dpse-synt-idmap.tsv
egrep 'to_species.3' synteny.tsv | perl -ne \
'($c,$b,$e,$t,$og,$ft,$nm,$id,$xx)=split"\t";print"$id\t$c\t$b\t$e\n";' \
> dmel-synt-idmap.tsv

   -->
   
  <script name="orthofix" type="FIXME_postprocess" shell="perl -i.old" language="perl">
<![CDATA[
# orthofix.pl 
# patch for dpse chado with missing ortho to_names
# needs data from features.tsv 
# could do as sql select: ortho featureloc -> gene featureloc -> featprop putative_ortho
# egrep -h 'putative_ortholog_of|orthologous_region|syntenic_region' features.tsv synteny.tsv | perl -n orthofix.pl
BEGIN { 
 %po=(); %sppmap= ('1' => 'Dmel', '3' => 'Dpse' ); 
}
chomp; 
($a,$b,$e,$s,$org,$t,$n,$id,$oid,$atype,$att)=split"\t";
$loc="$a:$b-$e";
if ($atype eq 'putative_ortholog_of') {
  $att =~ s/FlyBase://;
  $po{$loc}= $att;
} elsif ($org == 3 && $t eq 'orthologous_region') {
  # row order in synt.tsv is to_name > to_species
  if($atype eq 'to_species') { $to_species= $att; }
  elsif($atype eq 'to_name') { $to_name= $att; }
  if($to_species == 1) { # no to_name case; $l_oid != $oid && 
    unless($to_name) { $to_name= $po{$loc}; }
    if ($to_name) {
      print join("\t",($a,$b,$e,$s,$org,$t,$n,$id,$oid,'to_name',$to_name)),"\n" ;
      print join("\t",($a,$b,$e,$s,$org,$t,$n,$id,$oid,'to_species',$to_species)),"\n" ;
    } else { $nmiss++; }
    $to_name= $to_species= '';
  }
} else {
  print "$_\n"; # for rest of synt.tsv
  $to_name= $to_species= '';
}
$l_oid= $oid; 
END{print STDERR "missing ortho names=$nmiss\n"; }
]]>
  </script>
   
  <sql><![CDATA[
  -- dpse-synteny.sql for cross-species features (syntenic_region, orthologous_region)
  -- split output by species ? 
  -- this feature_table has extra column for species,to_species, no strand
  -- need other type="synteny_table"? or  use attr: species2=Dpse  ?
  
SELECT distinct
  armft.uniquename as arm,
  armloc.fmin, armloc.fmax, armloc.strand, 
  armft.organism_id,
  matchcv.name as type,
  matchft.name as name,
  matchft.uniquename as id,
  matchft.feature_id as oid,
  attr.type AS attr_type,
  attr.attribute AS attribute

FROM
  feature matchft, 
  feature armft,
  feature targft left outer join gffattr_synt2 attr 
    on (targft.feature_id = attr.feature_id),
  featureloc armloc, featureloc targloc,
  cvterm matchcv, cvterm armcv, cv socv

WHERE
  matchcv.name in ( 'syntenic_region', 'orthologous_region' )
  and matchcv.cvterm_id = matchft.type_id 

  and armcv.name in ( ${golden_path} ) 
  and armcv.cv_id = socv.cv_id and socv.name = '${seq_ontology}'
  and armcv.cvterm_id = armft.type_id  
  and armft.organism_id != targft.organism_id
    
  -- separate paired features here - keep chr-arm loc, target types/attr
  and matchft.feature_id = armloc.feature_id  
  and armft.feature_id = armloc.srcfeature_id
  and matchft.feature_id = targloc.feature_id  
  and targft.feature_id = targloc.srcfeature_id 

-- DO need ORDER clause
ORDER BY arm, armloc.fmin --, matchcv.name
;
  ]]></sql>
  </feature_sql>



  <feature_sql id="matches" type="feature_table"  output="matches.tsv">

  <!-- see below ests target; better than this -->
  <script name="matchfilt" type="SKIP_postprocess" shell="perl -ni.old" language="perl">
<![CDATA[
  #filter matches that exist in analysis set; keep EST/cDNAs matches; RE60666.5prime, LD02891.5prime and like
  print if (/([.:])(5prime|3prime|contig)/ && !/sim4:wrap/);
]]>
  </script>

  <sql><![CDATA[
-- use this to get the paired genome/target match items w/ attr (or null) --
-- jun04 - need to filter out apollo dupl. evidence for
-- match features, type = transposable_element_insertion_site
-- jan05 ; there are dmel chado features here not in analysis; but need to filter
-- those found in analysis set from here. Problem is finding shared OID !
-- change 'targft left outer join gffattr_match ' to 'matchft ..' ?

SELECT distinct
  armft.uniquename as arm,
  armloc.fmin, armloc.fmax, armloc.strand,
  armft.organism_id,
  targcv.name as type, 
  targft.name as name,
  targft.uniquename as id,
  targft.feature_id as oid,
  attr.type as attr_type, attr.attribute
 
FROM
  feature matchft, feature armft, 
  feature targft left outer join gffattr_match attr
     on (targft.feature_id = attr.feature_id),
  featureloc armloc, featureloc targloc,
  cvterm targcv, cvterm matchcv, cvterm armcv, cv socv
   
   -- tested speed of this w/ cv.names versus cvterm_id number inserted
   -- pg is smart enough to optimize cv name lookup once - leave as is
WHERE
  matchcv.name = 'match' 
  and matchcv.cvterm_id = matchft.type_id 
  and armcv.name in ( ${golden_path} )  
  and armcv.cv_id = socv.cv_id and socv.name = '${seq_ontology}'
  and armcv.cvterm_id = armft.type_id  

  and targft.type_id = targcv.cvterm_id 
  and targft.feature_id != armft.feature_id  
  
  -- separate paired features here - keep chr-arm loc, target types/attr
  and matchft.feature_id = armloc.feature_id  
  and armft.feature_id = armloc.srcfeature_id
    
  and matchft.feature_id = targloc.feature_id  
  and targft.feature_id = targloc.srcfeature_id 
   
-- dont need ORDER clause?? but see below
ORDER BY arm, armloc.fmin
;
  ]]></sql>
  </feature_sql>


  <feature_sql id="ests" type="feature_table"  output="ests.tsv">
  <script name="estaddpar" type="postprocess" shell="perl -pi.old" language="perl">
<![CDATA[
  #add dummy parent row for each new 'parent_oid nnn' line 
  if (/parent_oid\t(\S+)/ && !$didp{$1})  
    { $p=$1; $v=$_; $v=~s/(\S+)\tparent_oid/$p\tobject_oid/; $v=~s/match_part/match/; print $v; $didp{$p}=1; }
]]>
  </script>

  <sql><![CDATA[
-- feb05: chado est sql; separate from match & analysis 
-- AND exclude these feat types from analysis.tsv
-- match:,match_part: sim4:na_dbEST.same.dmel,sim4:na_dbEST.diff.dmel,sim4:na_DGC.in_process.dros

SELECT distinct
  armft.uniquename as arm,
  armloc.fmin, armloc.fmax, armloc.strand,
  armft.organism_id,
  'match_part:' || an.program || ':' || an.sourcename as type, 
  estft.name as name,
  estft.uniquename as id,   -- causing problems cuz not uniq among locations 
  partloc.feature_id AS oid, --  anfr.subject_id 
  text('parent_oid') AS attr_type, 
  anfr.object_id || partloc.srcfeature_id AS attribute
  -- partloc.srcfeature_id AS attribute -- this is not right == EST oid, same for all named est

FROM 
  feature armft, feature estft,
  featureloc armloc, featureloc partloc, 
  analysis an, analysisfeature anft, 
  feature_relationship anfr, 
  cvterm armcv, cv socv

WHERE 
  armcv.name in ( ${golden_path} ) 
  and armcv.cv_id = socv.cv_id and socv.name = '${seq_ontology}'
  and armcv.cvterm_id = armft.type_id  
  
  ${est_where_clause} 
  -- inverse of analysis_where_clause
  
  and anft.analysis_id = an.analysis_id
  and anft.feature_id = anfr.object_id
  and anfr.subject_id = armloc.feature_id  
  and armft.feature_id = armloc.srcfeature_id
  
  and anfr.subject_id = partloc.feature_id  
  and estft.feature_id = partloc.srcfeature_id
  and estft.type_id != armft.type_id  
  
ORDER BY arm, armloc.fmin
;
  ]]></sql>

  </feature_sql>
  
  
  
  <feature_sql id="analysis" type="feature_table" output="analysis.tsv">
  
  <!-- ant-like ; is this usable? -->
  <target name="main" depends="query,postprocess"/> 
  <target name="query" action="sql" /> 
  <target name="postprocess" action="rdump" description="fix parents w/o featureloc from kid values" /> 

  <!--  this works: perl -i.old rdump $r/tmp/featdump/analysis.tsv -->
  <script name="rdump" type="postprocess" shell="perl -i.old" language="perl">
<![CDATA[
my ($loid); my @r=();
while(<>) { 
  if (/^arm\t/) { print $_; next; } #header
  my @v= split "\t";
  my $oid= $v[-1]; 
  rdump() if ($loid && $oid ne $loid);  
  push(@r,\@v); $loid= $oid; 
  } 
open(ARGVOUT, ">>$ARGV"); select(ARGVOUT); # perl closes before this
rdump();

# nov04 bug in logic: can have dupl., successive oids on diff chromosomes: ortho matches !
# see below reinstated ORDER clause should fix; 
## ! NO - Order is bad; the ones needing fixing lack arm,fmin

sub rdump() {
  if (@r > 1) {
    my $par= $r[0]; 
    unless ($$par[0] && $$par[1] && $$par[2]) {
      # add kid range to parent if missing (likely)
      my $kid= $r[1]; #? check that $par oid == kids parent_oid ?
      my($a,$b,$e,$st,$org)= @{$kid}[0..4];
      foreach $kid (@r[2..$#r]) {
        my ($kb,$ke)= @{$kid}[1..2]; 
        $b=$kb if ($kb<$b); 
        $e=$ke if ($ke>$e); 
        }
      @{$par}[0..4] = ($a,$b,$e,$st,$org);
      if ($$par[-2] eq 'object_oid') { pop(@$par);  $$par[-1]="\n"; }  
      $r[0]= $par;
      }
    
    foreach my $i (1..$#r) { $r[$i]->[5] =~ s/^match:/match_part:/; }
    }
  foreach my $r (@r) { print join("\t",@$r); } 
  @r=(); 
} 

]]>
  </script>
  
  <!-- want sql tag at top level of feature_sql like others -->
  <sql><![CDATA[
-- select all an features of right type, with arm featureloc when avail
-- add analysis.sourcename where needed (ignore 'dummy');
-- change 'match_' leading type ; drop or make option?
-- ? problem for null sourcename? got an.program = pinsertion, null source, no type output
--  WHEN an.sourcename = 'dummy' THEN 'match:' || an.program
--  example where clauses from release configs:
--      est_where_clause="AND (an.program = 'sim4' AND
--        an.sourcename IN ('na_dbEST.same.dmel','na_dbEST.diff.dmel','na_DGC.in_process.dros'))"
--      analysis_where_clause="AND NOT (an.program = 'sim4' AND
--        an.sourcename IN ('na_dbEST.same.dmel','na_dbEST.diff.dmel','na_DGC.in_process.dros'))"
SELECT 
  armft.arm, armft.fmin, armft.fmax, armft.strand, 
  armft.organism_id,  
  CASE  
    WHEN (an.sourcename IS NULL OR an.sourcename = 'dummy') THEN 'match:' || an.program
    ELSE 'match:' || an.program || ':' || an.sourcename
  END 
  AS type, 
  targft.name AS name,  
  targft.uniquename AS id, 
  targft.feature_id AS oid,
  CASE WHEN attr.type IS NULL THEN text('object_oid')  ELSE attr.type  END 
  AS attr_type,
  CASE WHEN attr.attribute IS NULL THEN text(targft.feature_id) ELSE attr.attribute  END 
  AS attribute
FROM 
  feature targft left outer join gffatts_evid attr
    on (targft.feature_id = attr.feature_id),
  analysisfeature anf left outer join gffatts_anfloc armft
    on (anf.feature_id = armft.feature_id),
  analysis an
WHERE
  an.analysis_id = anf.analysis_id
  and anf.feature_id = targft.feature_id

  ${analysis_where_clause}
-- feb05: exclude ests; see above est_where_clause  
-- AND NOT (an.sourcename = 'sim4' AND an.program IN ('na_dbEST.same.dmel','na_dbEST.diff.dmel','na_DGC.in_process.dros') )

-- NEED ORDER to get parent/match_part oids aligned
-- NO, cant use ORDER cause of missing arm, fmin for parent feats fixed in rdump above
-- ORDER BY armft.arm, armft.fmin
;
  ]]></sql>
  </feature_sql>


<!-- featureprops  user lists 
 select cv.name,type_id,count(fp.featureprop_id) 
 from cvterm cv,featureprop fp where fp.type_id = cv.cvterm_id
 group by cv.name,type_id;

sgdlite=#
        name        | type_id | count 
 Note               |       2 | 19206  * keep
 gene               |     124 | 10022  x same as synonym
 orf_classification |   20614 | 13790  * keep

chado_dmel_r4_1_13-#  
           name            | type_id | count  
 aminoacid                 |       3 |    223
 anticodon                 |       4 |    185
 bound_moiety              |   57126 |    102
 citation                  |   57127 |   2188
 citation                  |   57633 |     10
 comment                   |       5 |  11545
 comment                   |   57156 |   1453
 cyto_range                |      21 |  15523   * keep
 description               |       7 | 699692
 dicistronic               |      22 |     53 ??
 element                   |       8 |   1365
 element                   |   55064 |      1
 encoded_symbol            |       9 |   1310
 evidence                  |   57134 |   5536
 evidence                  |   57267 |      8
 evidenceGB                |   59009 |     32
 gbunit                    |      20 |  15570   * keep
 linked_to                 |   57157 |    372
 missing_start_codon       |   57557 |      1
 missing_stop_codon        |   57558 |      1
 na_change                 |   57158 |    493
 non_canonical_start_codon |   57556 |     19
 owner                     |      11 |  19412
 phenotype                 |   57145 |      3
 pr_change                 |   57159 |    469
 problem                   |      12 |    566
 product                   |   57146 |      2
 protein_id                |      13 |  16959
 readthrough_stop_codon    |   57553 |      4
 reported_na_change        |   57160 |     81
 reported_pr_change        |   57161 |    142
 sp_comment                |      15 |  17915
 sp_status                 |      16 |  13658
 standard_name             |   57151 |      9
 status                    |      17 |    459
 symbol                    |      18 |  12310
 tag                       |     610 |      1
 validation_flag           |   57034 |   1552

 select cv.name,fp.type_id,count(fp.featureprop_id) 
 from cvterm cv,featureprop fp, feature f
 where fp.type_id = cv.cvterm_id 
 and fp.feature_id = f.feature_id and f.organism_id = 4  
 group by cv.name,fp.type_id;

chado_dpse_r20_1-# 
  name   | type_id | count 
 Alias   |   59645 | 10374
 comment |       5 |  5585
 gbunit  |      20 | 12197   * keep

-->

  <feature_sql id="views" type="view">
  <sql><![CDATA[
-- from sequence-gff-views.sql 
-- dont need all of these attribs;
--   dbxref: yes, cvterm: no, synonym: no?, pub: no, 
--   featureprop - want some: cyto_range, gbunit?(no)
-- add dbxref_2nd, aug04 for dbxref need to know primary/secondary > feature_dbxref.is_current !

-- attr view for regular features which may have parent features (exons)
CREATE OR REPLACE VIEW gffattr_gmodel (
    feature_id, type, attribute
) AS
  SELECT feature_id,  
    CASE WHEN fs.is_current IS FALSE THEN 'dbxref_2nd' ELSE 'dbxref' END AS type, 
    d.name || ':' || s.accession AS attribute
  FROM dbxref s, feature_dbxref fs, db d
  WHERE fs.dbxref_id = s.dbxref_id and s.db_id = d.db_id

UNION ALL
  SELECT feature_id, cv.name AS type, fp.value AS attribute
  FROM featureprop fp, cvterm cv
  WHERE fp.type_id = cv.cvterm_id
    and (cv.name in ( ${featureprops} ))
    -- keep this restriction - other props not useful here: comments, sp_comment, owner ...
    -- mar05: add 'putative_ortholog_of' cv for dpse/..; this is in feature_relationship

UNION ALL
  SELECT feature_id,
    CASE WHEN fs.is_current IS FALSE THEN 'synonym_2nd' ELSE 'synonym' END AS type, 
    s.synonym_sgml AS attribute
  FROM feature_synonym fs, synonym s
  WHERE fs.synonym_id = s.synonym_id and fs.is_internal IS FALSE

UNION ALL
  -- add parent feat ids for exons, etc.
  -- putative_ortholog_of=dmel/gene:CG31648
  SELECT pk.subject_id AS feature_id, cv.name AS type, 
    '${GFF_source}:' || fo.uniquename  AS attribute
  FROM feature_relationship pk, feature fo, cvterm cv
  WHERE 
    cv.name = 'putative_ortholog_of' and pk.type_id = cv.cvterm_id 
    and pk.object_id = fo.feature_id
    
UNION ALL
  -- add parent feat ids for exons, etc.
  SELECT pk.subject_id AS feature_id, 'parent_oid' AS type, 
    CASE  
      WHEN pk.rank IS NULL THEN text(pk.object_id)
      ELSE pk.object_id || ':' || pk.rank
    END 
  FROM feature_relationship pk
;
GRANT SELECT ON gffattr_gmodel TO PUBLIC;


-- seqlen/featloc-length view
-- CREATE OR REPLACE VIEW feature_length ( feature_id, flength ) AS
--   SELECT feature_id,  
--     CASE WHEN seqlen IS NULL THEN (fl.fmax - fl.fmin) ELSE  fs.seqlen END AS flength, 
--   FROM featureloc fl
--   WHERE fl.feature_id = feature_id 
-- ;

-- attr view for match features
CREATE OR REPLACE VIEW gffattr_match (
    feature_id,  type,  attribute
) AS
  SELECT feature_id,  
    CASE WHEN fs.is_current IS FALSE THEN 'dbxref_2nd' ELSE 'dbxref' END AS type, 
    d.name || ':' || s.accession AS attribute
  FROM dbxref s, feature_dbxref fs, db d
  WHERE fs.dbxref_id = s.dbxref_id and s.db_id = d.db_id

UNION ALL
  SELECT feature_id, cv.name AS type, fp.value AS attribute
  FROM featureprop fp, cvterm cv
  WHERE fp.type_id = cv.cvterm_id
    and (cv.name in ( ${featureprops} )) 

UNION ALL
  SELECT feature_id,
    CASE WHEN fs.is_current IS FALSE THEN 'synonym_2nd' ELSE 'synonym' END AS type, 
    s.synonym_sgml AS attribute
  FROM feature_synonym fs, synonym s
  WHERE fs.synonym_id = s.synonym_id and fs.is_internal IS FALSE
;
GRANT SELECT ON gffattr_match TO PUBLIC;

-- attrib view for cross-species feats (syntenic_region, orthology)
CREATE OR REPLACE VIEW gffattr_synteny (
    feature_id, type,  attribute
) AS
  --  parent feat ids for source supercontigs, etc.
  SELECT pk.subject_id, text('parent_oid') as type, 
    CASE  
      WHEN pk.rank IS NULL THEN text(pk.object_id)
      ELSE pk.object_id || ':' || pk.rank
    END 
  FROM feature_relationship pk
;
GRANT SELECT ON gffattr_synteny TO PUBLIC;

-- use this one instead of above
CREATE OR REPLACE VIEW gffattr_synt2 (
    feature_id,  type,  attribute
) AS
  SELECT feature_id, text('to_species') AS type, text(targ.organism_id) AS attribute
  FROM feature targ

-- see above orthofix.pl: add case when this select is missing, use ortho featloc
-- to find equal gene feature and putative_ortholog_of 
UNION ALL
  SELECT feature_id, text('to_name') AS type, 
    CASE WHEN targ.uniquename = targ.name THEN targ.name 
    ELSE targ.name || ',' || targ.uniquename 
    END AS attribute 
  FROM feature targ
  WHERE NOT( targ.type_id IN (
    select cvterm_id from cvterm 
    where name in ( ${golden_path} )   
    ) )
;
GRANT SELECT ON gffattr_synt2 TO PUBLIC;


-- for analysis features
-- problem where w/ some analysis features - promotor, transposon
-- ? need to restrict armcv to cv_id = SO id - e.g. find chromosome in 4 cv's
CREATE OR REPLACE VIEW gffatts_anfloc (
    feature_id, arm, fmin, fmax, strand, organism_id
  ) AS
  SELECT 
    armloc.feature_id,  
    armft.uniquename as arm,
    armloc.fmin, armloc.fmax, armloc.strand, armft.organism_id
  FROM  feature armft, featureloc armloc, cvterm armcv, cv socv
  WHERE 
    armft.type_id = armcv.cvterm_id
    and armcv.name in ( ${golden_path} ) 
    and armcv.cv_id = socv.cv_id and socv.name = '${seq_ontology}'
    and armft.feature_id = armloc.srcfeature_id
;
GRANT SELECT ON gffatts_anfloc TO PUBLIC;

-- for analysis features
CREATE OR REPLACE VIEW gffatts_evid (
    feature_id, type, attribute
) AS
  SELECT pk.subject_id, text('parent_oid'), text(pk.object_id)
  FROM feature_relationship pk
;
GRANT SELECT ON gffatts_evid TO PUBLIC;

  ]]></sql>
  </feature_sql>
</opt>