File: sam.lang

package info (click to toggle)
biosyntax 1.0.0b-6
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 6,020 kB
sloc: sh: 876; javascript: 292; xml: 246; makefile: 52
file content (224 lines) | stat: -rw-r--r-- 6,206 bytes
parent folder | download | duplicates (4)
# Sam language definition file for less pipe
# bioSyntax v0.1
# Depends on sam.style + bioSyntax.outlang

# Variables ----------------------------------
# Character classes declared as reusable regex fragments.
# NOTE(review): no rule visible in this file references either variable;
# confirm whether they are still needed before relying on them.

# Nucleotide letters: A/T/C/G/N/X in both cases, plus upper-case R/Y/S/W/K/M.
vardef NUCLEOTIDE = '[ATCGNXatcgnxRYSWKM]'
# Quality-score characters in the ASCII range '!' through 'K'.
vardef PHRED = '[!-K]' #PHRED 1.8

# Header-section -----------------------------
# Highlights SAM header lines, i.e. lines that begin with '@'.
# Each record type gets its own nested environment that runs to end of line;
# inside it, the value following a two-letter "XX:" key is matched through a
# look-behind on that key, so the key itself is left in the environment style.
environment header start '^(?=@)' begin

  # Standard Headers HD | SQ | PG
  environment header_hd delim '(?=@(HD|SQ|PG))' '$' begin
    # Record-type code at the start of the line.
    header = '@[HSPC][DQGO]'

    #HD
    numeric = '(?<=VN:)\S*'
    # Sort order: the named values are listed before the catch-all so that
    # they are tried first; anything unrecognised falls through to 'mal'.
    # ('mal' / 'bon' are presumably "bad" / "good" styles - confirm in sam.style.)
    mal = '(?<=SO:)(unsorted|unknown)'
    bon = '(?<=SO:)(coordinate|queryname)'
    mal = '(?<=SO:)\S*'

    # Grouping order: same pattern as SO above.
    mal = '(?<=GO:)(none|)'
    bon = '(?<=GO:)(query|reference)'
    mal = '(?<=GO:)\S*'

    #SQ
    chr = '(?<=SN:)\S*'
    chrStart = '(?<=LN:)\S*'
    chr = '(?<=AH:)\S*'
    chr = '(?<=AN:)\S*'
    chr = '(?<=AS:)\S*'
    string = '(?<=M5:)\S*'
    chr = '(?<=SP:)\S*'
    numeric = '(?<=UR:)\S*'

    #PG
    software = '(?<=ID:)\S*'
    software = '(?<=PN:)\S*'
    # Command lines contain spaces, so match up to the next tab.
    commandline = '(?<=CL:)[\S ]*'

    software = '(?<=PP:)\S*'
    # Descriptions are free text and may contain spaces, like CL above.
    string = '(?<=DS:)[\S ]*'
    #numeric = '(?<=VN:)\S*'
  end

  # Readgroup Header
  # NOTE(review): this environment shares the element name 'header_co' with
  # the comment-header environment below; confirm against sam.style whether a
  # dedicated read-group element was intended.
  environment header_co delim '(?=@RG)' '$' begin
    header = '@RG'

    sample = '(?<=ID:)\S*'
    string = '(?<=CN:)\S*'
    # Descriptions are free text and may contain spaces.
    string = '(?<=DS:)[\S ]*'
    numeric = '(?<=DT:)\S*'
    string = '(?<=FO:)\S*'
    string = '(?<=KS:)\S*'
    sample = '(?<=LB:)\S*'
    software = '(?<=PG:)\S*'
    numeric = '(?<=PI:)\S*'
    string = '(?<=PL:)\S*'
    string = '(?<=PM:)\S*'
    string = '(?<=PU:)\S*'
    sample = '(?<=SM:)\S*'

  end

  # Comment Header
  environment header_co delim '(?=@CO)' '$' begin
    header = '@CO'
    # Everything after the record code is treated as comment text.
    comment = '.*'

  end

end #Header
# -------------------------------------------
# align-section -----------------------------

# Highlights one SAM alignment record per line.
# The record is walked left to right: each tab-separated column is handled in
# its own nested environment (push2 ... push11), entered by consuming the tab
# that ends the previous column. Rules anchored with \A only fire at the
# current scan position. Every level drops out at end of line so that state
# never leaks into the next record.
# NOTE: the pushN environment names do not all line up with SAM column
# numbers (push7 covers two columns); the comments below give the real columns.
environment align start '(?=^[^$])' begin
  drop = '$' exitall # stops multi-line searching

  # Column 1: QNAME (read name)
  string = '^\S+'

  # Column 2: FLAG
  environment push2 = '\t?' begin
    drop = '(?=$)' exitall
    # Read Flag
    # Values 0-511 are styled as plain numbers.
    # Values >= 512 carry at least one of the bits
    #   0x200 QC-fail | 0x400 PCR/optical duplicate | 0x800 supplementary
    # and are styled differently so they stand out.

    numeric = '\A[0-9]{1,2}(?=\t)'
    numeric = '\A[1-4][0-9]{1,2}(?=\t)'
    numeric = '\A[5][0][0-9](?=\t)'
    numeric = '\A[5][1][01](?=\t)'

    # Flag >= 512 (QC fail / PCR dup / Supp Read): any remaining
    # three-or-more digit value not caught by the rules above.
    string = '\A\d\d\d+'

    # Column 3: RNAME
    environment push3 = '\t?' begin
      drop = '(?=$)' exitall

      chr = '\A\S+'

      # Column 4: POSition
      environment push4 = '\t?' begin
        drop = '(?=$)' exitall

        chrStart = '\A\d+'

        # Column 5: MAPQuality
        environment push5 = '\t?' begin
          drop = '(?=$)' exitall

          # MAPQ Scale (-log10 P)
          # Increment size = 10

          gradbw1 = '\A255(?=\t)' # MAPQ NA

          gradbw1 = '\A[0-9](?=\t)'
          gradbw6 = '\A[1][0-9]'
          gradbw8 = '\A[2][0-9]'
          gradbw10 = '\A[3][0-9]'
          gradbw10i = '\A[4-9][0-9]'

          # Fallback for any other numeric value.
          gradbw1 = '\A\d+'

          # Column 6: CIGAR string
          environment push6 = '\t?' begin
            drop = '(?=$)' exitall


            # CIGAR Strings
            # M: Alignment match
            # I: Insertion
            # D: Deletion
            # N: Skipped region (intron)
            # S: Soft Clip
            # H: Hard Clip
            # P: Padding
            # =: Sequence Match
            # X: Sequence Mismatch
            # *: String Absent

            clip      = '\A\d++[NSHP]' # clips
            default   = '\A\d++[M=]'   # match
            mismatch  = '\A\d++[X]'    # mismatch
            insertion = '\A\d++[I]'    # insertion
            deletion  = '\A\d++[D]'    # deletion

            comment = '\A\S++'      # else (including '*')


            # Columns 7-8: RNEXT & PNEXT (matched together, with both tabs)
            environment push7 = '\t?' begin
              drop = '(?=$)' exitall

              # Match Chromosome
              chrStart = '\A=\t\d+\t'

              # Unmapped Rnext ('*' must be escaped to be a literal asterisk)
              string = '\A\*\t\d+\t'

              # Different Chromosome Rnext
              string = '\A\S+\t\d+\t'

              # Column 9: TLEN (signed: negative for the rightmost segment)
              environment push8 = '(?=-?\d)' begin
                drop = '(?=$)' exitall

                numeric = '-?\d+'

                # Column 10: SEQ (sequence data), one style per base run
                environment push9 = '\t?' begin
                  drop = '(?=$)' exitall

                  ntA = '\A[Aa]++'
                  ntT = '\A[TtUu]++'
                  ntG = '\A[Gg]++'
                  ntC = '\A[Cc]++'
                  ntN = '\A[NnXx]++'
                  ntN = '\A[RrYySsWwKkMm]++'

                  # Column 11: QUAL (PHRED)
                  environment push10 = '\t?' begin
                    drop = '(?=$)' exitall

                    # Illumina 1.8 Scale
                    # !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJ...j
                    # |   |   |   |   |   |   |   |   |   |...
                    # The bins below are disjoint, so each run of quality
                    # characters lands in exactly one gradient step.
                    gradbw1 = '\A[!-%]++'
                    gradbw3 = '\A[&-(]++'
                    gradbw6 = '\A[)-0]++'
                    gradbw8 = '\A[1-8]++'
                    gradbw10 = '\A[9-@]++'
                    gradbw10b = '\A[A-D]++'
                    gradbw10i = '\A[E-j]++'

                    # Column 12+: optional TAG:TYPE:VALUE fields
                    environment push11 = '\t' begin
                      drop = '(?=$)' exitall

                      # AB:i:1337
                        # A Printable
                        # i signed integer
                        # f floating point
                        # Z string with spaces
                        # H byte array hex
                        # B Integer/Numeric Array

                      # Two-letter tag name with its trailing colon.
                      string = '[A-Z]{2}:'
                      # Type code, then value: numeric-looking types first,
                      # text/array types second.
                      (comment,numeric) = `([ifH]:)(\S*)`
                      (comment,keyword) = `([AZB]:)(\S*)`

                    end # push11 (tags)
                  end # push10 (QUAL)
                end # push9 (SEQ)
              end # push8 (TLEN)
            end # push7 (RNEXT & PNEXT)
          end # push6 (CIGAR)
        end # push5 (MAPQ)
      end # push4 (POS)
    end # push3 (RNAME)
  end # push2 (FLAG)
end # align section