#!usr/bin/perl -w # SNPatron v2.0 # # NOTES: input FASTA file MUST have UNIX line ends # also sequence CANNOT hold any characters other than IUBs and '-'s ############################ # User entered information # ############################ print "Enter the full multi-FASTA filename: "; my $dnaseqfilename = ; chomp $dnaseqfilename; # Remove 3-letter extension from filename # (remainder used as stem for output files) my $filename = substr($dnaseqfilename,0,((length $dnaseqfilename) - 4)); print "Does the sequence contain heterozygotes? (yes = 1, no = 0): "; my $make_haps = ; chomp $make_haps; ########################## # Input multi-FASTA file # ########################## open(DNASEQS, $dnaseqfilename); @dna = ; close DNASEQS; ######################### # Read in sequence data # # Pull out seq names # ######################### my @hap_seqs; my $seq_names; # if input sequences are homozygous if($make_haps == 0) { @hap_seqs = SepFastaSeqs(@dna); $seq_names = ExtractSeqNames(\$make_haps, \@dna); } # if input sequence are heterozygous, construct pseudohaplotypes # (heterozygous bases split arbitrarily between 'haplotypes') if($make_haps == 1) { my(@split_seqs) = SepFastaSeqs(@dna); @hap_seqs = PseudoHaps(@split_seqs); $seq_names = ExtractSeqNames(\$make_haps, \@dna); } ################################# # Make/Print consensus sequence # ################################# my($consensus) = MakeConsensus(@hap_seqs); PrintFormattedSeq($consensus, $filename); ######################## # Find SNPs and Indels # ######################## # Locate all bi- and tri-allelic SNPs # output to file "filename_SNPS.txt" # (data passed to subroutine by reference) SnpFinder(\$filename, \$seq_names, \$consensus, \@hap_seqs); # Locate all simple bi-allelic InDels, and more complex events # output to file "filename_INDELS.txt" # (data passed to subroutine by reference) InDelFinder(\$filename, \$seq_names, \$consensus, \@hap_seqs); exit; ####################################################################### # # # # # SUBROUTINES BELOW # # # # # ####################################################################### #################### # READ IN: multi-FASTA array # FUNCTION: move each sequence to a separate array element # remove whitespace from each sequence # capitalize all nucleotides # NEEDS: uses the 'CapitalizeBases' subroutine #################### sub SepFastaSeqs { my(@inputseq)=@_; my($c_head) = -1; my(@seqs)=''; foreach my $line (@inputseq) { if($line =~ /^>/) { ++$c_head; } else { $line =~ s/\s//g; #remove whitespace $line = CapitalizeBases($line); #capitalize all bases $seqs[$c_head] .= $line; } } return @seqs; } #################### # READ IN: multi-FASTA array # FUNCTION: extract names of all sequences, output directly # for no pseudohaps, and for pseudohaps, duplicate # each name, adding '_hapA' and '_hapB' extensions #################### sub ExtractSeqNames { my($h_code, $input)=@_; #multi-FASTA in, and code for whether will be making pseudohaps my($names)=''; #string of sequence names foreach my $line (@$input) { # the .*? part ensures name pulled out whether or not # a space is present between '>' and 'name' if($line =~ /^>.*?([a-z,A-Z,0-9,-,_,.]+)/) { # no pseudohaplotypes - just extract names if($$h_code == 0) { $names .= "$1\t"; } # pseudohaplotypes - print each name twice in succession, # followed by '_hapA' and 'hapB' if($$h_code == 1) { $names .= "$1_hapA\t$1_hapB\t"; } } } chop $names; return($names); } #################### # READ IN: a sequence string # FUNCTION: alter all lowercase-letter bases to uppercase #################### sub CapitalizeBases { my($lowercase)=@_; my $uppercase = $lowercase; $uppercase =~ tr/a-z/A-Z/; return $uppercase; } #################### # READ IN: array of sequence strings # FUNCTION: change each input sequence into a pair of pseudohaplotypes # heterozygote alleles are arbitrarily split across pseudohaps #################### sub PseudoHaps { my(@in_seqs) = @_; my($nt_query); my(@pse_haps); for(my $index = 0; $index < scalar @in_seqs; ++$index) { for(my $bp = 0; $bp < length($in_seqs[$index]); ++$bp) { $nt_query = substr($in_seqs[$index],$bp,1); if($nt_query =~ /N|A|C|T|G|-/) { @pse_haps[$index*2] .= $nt_query; @pse_haps[(($index+1)*2)-1] .= $nt_query; } if($nt_query =~ /R/) { @pse_haps[$index*2] .= 'A'; @pse_haps[(($index+1)*2)-1] .= 'G'; } if($nt_query =~ /Y/) { @pse_haps[$index*2] .= 'C'; @pse_haps[(($index+1)*2)-1] .= 'T'; } if($nt_query =~ /M/) { @pse_haps[$index*2] .= 'A'; @pse_haps[(($index+1)*2)-1] .= 'C'; } if($nt_query =~ /K/) { @pse_haps[$index*2] .= 'T'; @pse_haps[(($index+1)*2)-1] .= 'G'; } if($nt_query =~ /S/) { @pse_haps[$index*2] .= 'G'; @pse_haps[(($index+1)*2)-1] .= 'C'; } if($nt_query =~ /W/) { @pse_haps[$index*2] .= 'A'; @pse_haps[(($index+1)*2)-1] .= 'T'; } } } return(@pse_haps); } #################### # READ IN: array of sequence strings # FUNCTION: for each site, compare alleles across all sequences, output consensus # NEEDS: uses the 'PairedConsensus' subroutine #################### sub MakeConsensus { my(@seqs)=@_; my $bp_total = length($seqs[0]); #alignment length my $t_nuc; #a given base in 1st sequence my $nt_pair; #two-base string (1st_seq.Nth_seq) my $consen = ''; # loop through sites; compare base in 1st seq with equivalent # base in all other sequences; keep overall consensus base for(my $bp = 0; $bp < $bp_total; ++$bp) { $t_nuc = substr($seqs[0],$bp,1); for(my $t_seq = 1; $t_seq < scalar @seqs; ++$t_seq) { $nt_pair = $t_nuc.substr($seqs[$t_seq],$bp,1); $t_nuc = PairedConsensus($nt_pair); } $consen .= $t_nuc; } return($consen); } #################### # READ IN: a two-letter sequence string # (representing two alleles for a given site) # FUNCTION: return the consensus base # IUB rules apply # any InDel variation ('-') at a site yields a lowercase consensus #################### sub PairedConsensus { my($duplet) = @_; # hyphen '-' is a gap if($duplet =~ /-N|N-|--/) {return '-'} # N's ignored elsif($duplet =~ /AA|NA|AN/) {return 'A'} elsif($duplet =~ /CC|NC|CN/) {return 'C'} elsif($duplet =~ /GG|NG|GN/) {return 'G'} elsif($duplet =~ /TT|NT|TN/) {return 'T'} # gaps turn nucleotide to lowercase elsif($duplet =~ /-A|A-/) {return 'a'} elsif($duplet =~ /-B|B-/) {return 'b'} elsif($duplet =~ /-C|C-/) {return 'c'} elsif($duplet =~ /-D|D-/) {return 'd'} elsif($duplet =~ /-G|G-/) {return 'g'} elsif($duplet =~ /-H|H-/) {return 'h'} elsif($duplet =~ /-K|K-/) {return 'k'} elsif($duplet =~ /-M|M-/) {return 'm'} elsif($duplet =~ /-R|R-/) {return 'r'} elsif($duplet =~ /-S|S-/) {return 's'} elsif($duplet =~ /-T|T-/) {return 't'} elsif($duplet =~ /-V|V-/) {return 'v'} elsif($duplet =~ /-W|W-/) {return 'w'} elsif($duplet =~ /-Y|Y-/) {return 'y'} # triplet and quadruplet (N) nucleotides elsif($duplet =~ /KC|BC|YG|BG|ST|BT|BN/) {return 'B'} elsif($duplet =~ /KY|SY|BY|CK|YK|SK|BK/) {return 'B'} elsif($duplet =~ /YS|KS|BS|CB|GB|TB|NB/) {return 'B'} elsif($duplet =~ /KB|SB|BB|YB|TS|GY/) {return 'B'} elsif($duplet =~ /KA|DA|WG|DG|RT|DT|DN/) {return 'D'} elsif($duplet =~ /TR|KR|WR|DR|AK|RK|WK/) {return 'D'} elsif($duplet =~ /DK|GW|RW|KW|DW|AD|GD/) {return 'D'} elsif($duplet =~ /TD|ND|RD|KD|WD|DD/) {return 'D'} elsif($duplet =~ /YA|HA|WC|HC|MT|HT|HN/) {return 'H'} elsif($duplet =~ /AY|MY|WY|HY|TM|YM|WM/) {return 'H'} elsif($duplet =~ /HM|CW|YW|MW|HW|AH|CH/) {return 'H'} elsif($duplet =~ /TH|NH|YH|MH|WH|HH/) {return 'H'} elsif($duplet =~ /SA|VA|RC|VC|MG|VG|VN/) {return 'V'} elsif($duplet =~ /CR|MR|SR|VR|GM|RM|SM/) {return 'V'} elsif($duplet =~ /VM|AS|RS|MS|VS|AV|CV/) {return 'V'} elsif($duplet =~ /GV|NV|RV|MV|SV|VV/) {return 'V'} elsif($duplet =~ /TG|KG|GT|KT|KN/) {return 'K'} elsif($duplet =~ /GK|TK|NK|KK/) {return 'K'} elsif($duplet =~ /CA|MA|AC|MC|MN/) {return 'M'} elsif($duplet =~ /AM|CM|NM|MM/) {return 'M'} elsif($duplet =~ /GA|RA|AG|RG|RN/) {return 'R'} elsif($duplet =~ /AR|GR|NR|RR/) {return 'R'} elsif($duplet =~ /GC|SC|CG|SG|SN/) {return 'S'} elsif($duplet =~ /CS|GS|NS|SS/) {return 'S'} elsif($duplet =~ /TA|WA|AT|WT|WN/) {return 'W'} elsif($duplet =~ /AW|TW|NW|WW/) {return 'W'} elsif($duplet =~ /TC|YC|CT|YT|YN/) {return 'Y'} elsif($duplet =~ /CY|TY|NY|YY/) {return 'Y'} elsif($duplet =~ /BA|DC|HG|VT|NN|YR|BR/) {return 'N'} elsif($duplet =~ /HR|RY|DY|VY|KM|BM|DM/) {return 'N'} elsif($duplet =~ /MK|HK|VK|WS|DS|HS|SW/) {return 'N'} elsif($duplet =~ /BW|VW|AB|RB|MB|WB|DB/) {return 'N'} elsif($duplet =~ /HB|VB|CD|YD|MD|SD|BD/) {return 'N'} elsif($duplet =~ /HD|VD|GH|RH|KH|SH|BH/) {return 'N'} elsif($duplet =~ /DH|VH|TV|YV|KV|WV|BV/) {return 'N'} elsif($duplet =~ /DV|HV/) {return 'N'} # deals with input gap heterozygotes elsif($duplet =~ /Aa|Na|-a|aa|a-|aN|aA/) {return 'a'} elsif($duplet =~ /Cc|Nc|-c|cc|c-|cN|cC/) {return 'c'} elsif($duplet =~ /Gg|Ng|-g|gg|g-|gN|gG/) {return 'g'} elsif($duplet =~ /Tt|Nt|-t|tt|t-|tN|tT/) {return 't'} elsif($duplet =~ /Cb|Gb|Tb|Nb|Yb|Kb|Sb/) {return 'b'} elsif($duplet =~ /Bb|-b|cb|gb|tb|yb|kb/) {return 'b'} elsif($duplet =~ /sb|bb|Ts|Ys|Ks|Bs|ts/) {return 'b'} elsif($duplet =~ /ys|ks|bs|Gy|Ky|Sy|By/) {return 'b'} elsif($duplet =~ /gy|ky|sy|by|Ck|Yk|Sk/) {return 'b'} elsif($duplet =~ /Bk|ck|yk|sk|bk|St|Bt/) {return 'b'} elsif($duplet =~ /st|bt|Kc|Bc|kc|bc|Yg/) {return 'b'} elsif($duplet =~ /Bg|yg|bg|b-|cK|yK|sK/) {return 'b'} elsif($duplet =~ /bK|tS|yS|kS|bS|cB|gB/) {return 'b'} elsif($duplet =~ /tB|yB|kB|sB|bB|gY|kY/) {return 'b'} elsif($duplet =~ /sY|bY|bN|kC|bC|yG|bG|sT|bT/) {return 'b'} elsif($duplet =~ /Ad|Gd|Td|Nd|Rd|Kd|Wd/) {return 'd'} elsif($duplet =~ /Dd|-d|ad|gd|td|rd|kd/) {return 'd'} elsif($duplet =~ /wd|dd|Gw|Rw|Kw|Dw|gw/) {return 'd'} elsif($duplet =~ /rw|kw|dw|Ak|Rk|Wk|Dk/) {return 'd'} elsif($duplet =~ /ak|rk|wk|dk|Rt|Dt|rt/) {return 'd'} elsif($duplet =~ /dt|Tr|Kr|Wr|Dr|tr|kr/) {return 'd'} elsif($duplet =~ /wr|dr|Wg|Dg|wg|dg|Ka/) {return 'd'} elsif($duplet =~ /Da|ka|da|aD|gD|tD|rD/) {return 'd'} elsif($duplet =~ /kD|wD|dD|d-|aK|rK|wK/) {return 'd'} elsif($duplet =~ /dK|gW|rW|kW|dW|dN|tR/) {return 'd'} elsif($duplet =~ /kR|wR|dR|kA|dA|wG|dG|rT|dT/) {return 'd'} elsif($duplet =~ /Ah|Ch|Th|Nh|Yh|Mh|Wh/) {return 'h'} elsif($duplet =~ /Hh|-h|ah|ch|th|yh|mh/) {return 'h'} elsif($duplet =~ /wh|hh|Cw|Yw|Mw|Hw|cw/) {return 'h'} elsif($duplet =~ /yw|mw|hw|Ay|My|Wy|Hy/) {return 'h'} elsif($duplet =~ /ay|my|wy|hy|Tm|Ym|Wm/) {return 'h'} elsif($duplet =~ /Hm|tm|ym|wm|hm|Mt|Ht/) {return 'h'} elsif($duplet =~ /hW|aY|mY|wY|hY|tM|yM/) {return 'h'} elsif($duplet =~ /wM|hM|hN|yA|hA|wC|hC|mT|hT/) {return 'h'} elsif($duplet =~ /mH|wH|hH|h-|cW|yW|mW/) {return 'h'} elsif($duplet =~ /Ha|ya|ha|aH|cH|tH|yH/) {return 'h'} elsif($duplet =~ /mt|ht|Wc|Hc|wc|hc|Ya/) {return 'h'} elsif($duplet =~ /Gk|Tk|Nk|Kk|-k|gk|tk/) {return 'k'} elsif($duplet =~ /tg|kg|k-|gK|tK|kK|kN/) {return 'k'} elsif($duplet =~ /kk|Gt|Kt|gt|kt|Tg|Kg/) {return 'k'} elsif($duplet =~ /tG|kG|gT|kT/) {return 'k'} elsif($duplet =~ /Am|Cm|Nm|Mm|-m|am|cm/) {return 'm'} elsif($duplet =~ /mm|Ac|Mc|ac|mc|Ca|Ma/) {return 'm'} elsif($duplet =~ /ca|ma|m-|aM|cM|mM|mN/) {return 'm'} elsif($duplet =~ /cA|mA|aC|mC/) {return 'm'} elsif($duplet =~ /Ar|Gr|Nr|Rr|-r|ar|gr/) {return 'r'} elsif($duplet =~ /rr|Ag|Rg|ag|rg|Ga|Ra/) {return 'r'} elsif($duplet =~ /ga|ra|r-|rN|aR|gR|rR/) {return 'r'} elsif($duplet =~ /gA|rA|aG|rG/) {return 'r'} elsif($duplet =~ /Cy|Ty|Ny|Yy|-y|cy|ty/) {return 'y'} elsif($duplet =~ /yy|Ct|Yt|ct|yt|Tc|Yc/) {return 'y'} elsif($duplet =~ /tc|yc|y-|cY|tY|yY|yN/) {return 'y'} elsif($duplet =~ /tC|yC|cT|yT/) {return 'y'} elsif($duplet =~ /Aw|Tw|Nw|Ww|-w|aw|tw/) {return 'w'} elsif($duplet =~ /ww|At|Wt|at|wt|Ta|Wa/) {return 'w'} elsif($duplet =~ /ta|wa|w-|aW|tW|wW|wN/) {return 'w'} elsif($duplet =~ /tA|wA|aT|wT/) {return 'w'} elsif($duplet =~ /Cs|Gs|Ns|Ss|-s|cs|gs/) {return 's'} elsif($duplet =~ /ss|Gc|Sc|gc|sc|Cg|Sg/) {return 's'} elsif($duplet =~ /cg|sg|s-|cS|gS|sS|sN/) {return 's'} elsif($duplet =~ /gC|sC|cG|sG/) {return 's'} elsif($duplet =~ /Av|Cv|Gv|Nv|Rv|Mv|Sv/) {return 'v'} elsif($duplet =~ /Vv|-v|av|cv|gv|rv|mv/) {return 'v'} elsif($duplet =~ /sv|vv|As|Rs|Ms|Vs|as/) {return 'v'} elsif($duplet =~ /rs|ms|vs|Gm|Rm|Sm|Vm/) {return 'v'} elsif($duplet =~ /gm|rm|sm|vm|Cr|Mr|Sr/) {return 'v'} elsif($duplet =~ /Va|sa|va|aV|cV|gV|rV/) {return 'v'} elsif($duplet =~ /mV|sV|vV|v-|aS|rS|mS/) {return 'v'} elsif($duplet =~ /rc|vc|Mg|Vg|mg|vg|Sa/) {return 'v'} elsif($duplet =~ /Vr|cr|mr|sr|vr|Rc|Vc/) {return 'v'} elsif($duplet =~ /vS|gM|rM|sM|vM|vN|cR/) {return 'v'} elsif($duplet =~ /mR|sR|vR|sA|vA|rC|vC|mG|vG/) {return 'v'} elsif($duplet =~ /Ab|Rb|Mb|Wb|Db|Hb|Vb/) {return 'N'} elsif($duplet =~ /ab|rb|mb|wb|db|hb|vb/) {return 'N'} elsif($duplet =~ /Cd|Yd|Md|Sd|Bd|Hd|Vd/) {return 'N'} elsif($duplet =~ /cd|yd|md|sd|bd|hd|vd/) {return 'N'} elsif($duplet =~ /Gh|Rh|Kh|Sh|Bh|Dh|Vh/) {return 'N'} elsif($duplet =~ /gh|rh|kh|sh|bh|dh|vh/) {return 'N'} elsif($duplet =~ /Tv|Yv|Kv|Wv|Bv|Dv|Hv/) {return 'N'} elsif($duplet =~ /tv|yv|kv|wv|bv|dv|hv/) {return 'N'} elsif($duplet =~ /Sw|Bw|Vw|sw|bw|vw|Ws/) {return 'N'} elsif($duplet =~ /mk|hk|vk|Km|Bm|Dm|km/) {return 'N'} elsif($duplet =~ /km|bm|dm|Vt|vt|Yr|Br/) {return 'N'} elsif($duplet =~ /Vy|ry|dy|vy|Mk|Hk|Vk/) {return 'N'} elsif($duplet =~ /Ds|Hs|ws|ds|hs|Ry|Dy/) {return 'N'} elsif($duplet =~ /hR|bA|dC|hG|vT/) {return 'N'} elsif($duplet =~ /rB|mB|wB|dB|hB|vB|rY/) {return 'N'} elsif($duplet =~ /wS|dS|hS|sW|bW|vW|aB/) {return 'N'} elsif($duplet =~ /sH|bH|dH|vH|tV|yV|kV/) {return 'N'} elsif($duplet =~ /sD|bD|hD|vD|gH|rH|kH/) {return 'N'} elsif($duplet =~ /Hg|hg|Ba|ba|cD|yD|mD/) {return 'N'} elsif($duplet =~ /Hr|yr|br|hr|Dc|dc|Hg/) {return 'N'} elsif($duplet =~ /wV|bV|dV|hV|mK|hK|vK/) {return 'N'} elsif($duplet =~ /dY|vY|kM|bM|dM|yR|bR/) {return 'N'} } #################### # READ IN: consensus sequence string # filename stem of input multi_FASTA # FUNCTION: print consensus sequence to file # set to 80nt lines split into 10nt sections # file named after input multi-FASTA file #################### sub PrintFormattedSeq { my($rawseq,$name)=@_; my($len_line)=80; #SET for different line lengths my($sec_size)=10; #SET for different section lengths my $output = $name."_consensus.txt"; #output file name open(CONS, ">$output"); my $tmp_line; #temporarily holds a single sequence line # set first line of file as input filename print CONS "> $name\n"; # go through consensus sequence and print to file for(my $pos=0;$pos < length($rawseq);$pos += $len_line) { # print nt position at start of line print CONS $pos+1,"\t"; # store current line of sequence # print it in short sections separated by spaces # if/else ensures that final section does not have a space after it $tmp_line = substr($rawseq,$pos,$len_line); for(my $line_pos=0; $line_pos < length($tmp_line); $line_pos += $sec_size) { if($line_pos == ($len_line - $sec_size)) { print CONS substr($tmp_line,$line_pos,$sec_size); } else { print CONS substr($tmp_line,$line_pos,$sec_size)," "; } } print CONS "\n"; } close CONS; } #################### # READ IN: filename stem of input multi_FASTA # consensus sequence string # array of sequence strings # FUNCTION: check each base of consensus for bi- and triallelic SNPs # such polymorphisms appearing on the background of an InDel # event are not scored # for each SNP, output the polymorphism type (bi- or tri-), # the position, the IUB, the major and minor alleles, the number # of major and minor alleles seen, the minor allele frequency # up- and downstream OLA oligos (only for bi-), GC% for each # oligo, and the allelic state for each sequence # NEEDS: uses the 'FreqSnpAllele' subroutine # uses the 'OligoGCpercent' subroutine #################### sub SnpFinder { # read in input filename, the consensus sequence string, and # the array of sequences # (pass by reference as passing scalars AND an array) my($filename, $seq_heads, $con, $seqs)=@_; my $query; #to store IUB code of site my $pos; #to hold actual position of site # open OUTPUT SNP data-file # print headers my $s_output = $$filename."_SNPS.txt"; open(SNP, ">$s_output"); print SNP "type\tposition\tIUB\tmaj_allele\tmin_allele\tnum_maj_al\tnum_min_al\tminor_al_freq\tupoligo_maj\tupoligo_min\tdown_oligo\tGC_upmaj\tGC_upmin\tGC_down\t".$$seq_heads."\n"; # loop through consensus bases # NOTE: only checks for polymorphic sites up to 16bp # from the end of the alignment, to ensure that a full # genotyping oligo can be designed from the available data for(my $bp = 0; $bp < (length($$con)-16); ++$bp) { # pull out query base from consensus # and its actual position $query = substr($$con,$bp,1); $pos = $bp + 1; # if site is bi-allelic if($query =~ /R|Y|M|K|S|W/) { # pull out the allele for this site from each sequence my $nucs = ''; for(my $n_seq = 0; $n_seq < scalar @$seqs; ++$n_seq) { $nucs .= substr(@$seqs[$n_seq],$bp,1)."\t"; } # pull out the two alleles, their counts, and the minor allele freq my($maj,$min,$c_maj,$c_min,$freq) = FreqSnpAllele($query, $nucs); # pull out up- and down-stream oligos from consensus my $up_maj = substr($$con,$bp-15,15).$maj; my $up_min = substr($$con,$bp-15,15).$min; my $down = substr($$con,$bp+1,16); # calculate GC% for each oligo my $gc_upmaj = OligoGCpercent($up_maj); my $gc_upmin = OligoGCpercent($up_min); my $gc_down = OligoGCpercent($down); # concatenate SNP info and print to file my $dat_out = "bi\t$pos\t$query\t$maj\t$min\t$c_maj\t$c_min\t$freq\t$up_maj\t$up_min\t$down\t$gc_upmaj\t$gc_upmin\t$gc_down\t$nucs"; chop $dat_out; #removes extraneous tab from end print SNP "$dat_out\n"; } # if site is tri-allelic elsif($query =~ /H|B|V|D/) { # pull out the allele for this site from each sequence my $nucs = ''; for(my $n_seq = 0; $n_seq < scalar @$seqs; ++$n_seq) { $nucs .= substr(@$seqs[$n_seq],$bp,1)."\t"; } # concatenate SNP info and print to file # NO OLIGOS DESIGNED my $dat_out = "tri\t$pos\t$query\t\t\t\t\t\t\t\t\t\t\t\t$nucs"; chop $dat_out; #removes extraneous tab from end print SNP "$dat_out\n"; } # ignore if site is fixed/shows any InDel polymorphism else {next} } close SNP; } #################### # READ IN: IUB code for biallelic SNP # string of nucleotides at a site (over all sequences) # FUNCTION: finds major and minor allele, the number of each observed, # calculates the minor allele frequency and outputs # major_allele, minor_allele, count_major, count_minor, # minor_al_freq #################### sub FreqSnpAllele { my($iub,$nt_string)=@_; #read in the IUB for the biallelic SNP, #and the string of bases at the SNP my $a = ''; #the (alphabetically) 1st allele my $b = ''; #the (alphabetically) 2st allele my $a_count = 0; #number of (alphabetically) 1st allele my $b_count = 0; #number of (alphabetically) 2nd allele my $freq = 0; #minor allele freq # for each type of biallelic SNP, count number of each allele, if($iub eq 'R') { $a = 'A'; $b = 'G'; $a_count = ($nt_string =~ tr/A//); $b_count = ($nt_string =~ tr/G//); } if($iub eq 'Y') { $a = 'C'; $b = 'T'; $a_count = ($nt_string =~ tr/C//); $b_count = ($nt_string =~ tr/T//); } if($iub eq 'M') { $a = 'A'; $b = 'C'; $a_count = ($nt_string =~ tr/A//); $b_count = ($nt_string =~ tr/C//); } if($iub eq 'K') { $a = 'G'; $b = 'T'; $a_count = ($nt_string =~ tr/G//); $b_count = ($nt_string =~ tr/T//); } if($iub eq 'S') { $a = 'C'; $b = 'G'; $a_count = ($nt_string =~ tr/C//); $b_count = ($nt_string =~ tr/G//); } if($iub eq 'W') { $a = 'A'; $b = 'T'; $a_count = ($nt_string =~ tr/A//); $b_count = ($nt_string =~ tr/T//); } # calc minor allele freq (assuming 'b' is rare) $freq = $b_count/($a_count+$b_count); # return each allele (major first - alphabetically 1st allele # wins ties), count for each allele, and minor allele freq if($a_count >= $b_count) {return $a,$b,$a_count,$b_count,$freq} else {return $b,$a,$b_count,$a_count,(1-$freq)} } #################### # READ IN: oligo seq (string) ripped from consensus # FUNCTION: calculates the GC%, and returns it to 1 decimal place # ignores N's and '-'s # IUB's get the appropriate proportion (i.e. R = A/G, gets 0.5, # and H = A/C/T, gets 0.33) #################### sub OligoGCpercent { my($oligo)=@_; #read in the oligo sequence # calculate relevant oligo length (ie. minus 'N' and '-' sites) my $size = (length $oligo) - ($oligo =~ tr/N-//); # 'C', 'G', and 'S': probability of GC at site = 1 my $num_cg = ($oligo =~ tr/CcGgSs//); # 'R', 'Y', 'M', 'K': probability of GC at site = 1/2 my $num_rymk = ($oligo =~ tr/RrYyMmKk//)*0.5; # 'H' and 'D': probability of GC at site = 1/3 my $num_hd = ($oligo =~ tr/HhDd//)*(1/3); # 'B' and 'V': probability of GC at site = 2/3 my $num_bv = ($oligo =~ tr/BbVv//)*(2/3); # calculate percentage GC # add a very tiny value so that the rounding by 'sprintf' actually works... # and return value rounded to 1dp my $percent = ((($num_cg+$num_rymk+$num_hd+$num_bv)/$size)*100)+0.00001; return(sprintf("%.1f", $percent)); } #################### # READ IN: filename stem of input multi_FASTA # consensus sequence string # array of sequence strings # FUNCTION: create vector of sites showing indel variation (using consensus) # go through each set of adjacent indel-harboring sites, and check # each set is a simple biallelic indel # multi-allelic poly-A tract polymorphisms and microsats, and indels # harboring other segregating sites are labeled 'complex' # for each InDel, output the type (biallelic or complex), the start # position of the InDel, the major and minor alleles (I or D), # the number of major and minor alleles seen, the minor allele frequency # up- and downstream OLA oligos (only for bi-), GC% for each # oligo, and the allelic state for each sequence (I or D or N) # NEEDS: uses the 'OligoGCpercent' subroutine #################### sub InDelFinder { # read in input filename, the consensus sequence string, and # the array of sequences # (pass by reference as passing scalars AND an array) my($filename, $seq_heads, $con, $seqs)=@_; # open OUTPUT INDEL data-file # print headers my $i_output = $$filename."_INDELS.txt"; open(ID, ">$i_output"); print ID "type\tindel_start\tlength\tmaj_allele\tmin_allele\tnum_maj_al\tnum_min_al\tminor_al_freq\tupoligo_maj\tupoligo_min\tdown_oligo\tGC_upmaj\tGC_upmin\tGC_down\t".$$seq_heads."\n"; my @i_code; #binary code: 0 = no indel at site, 1 = indel at site my $i_site = ''; #holds the consensus nt at the site $i_code[0] = 0; #set first element of '$i_code' to zero #so '$i_code' will be 1 element longer than actual alignment #allowing subsequent indel-finding function (as written) to #detect indels starting at very position of alignment # fill in binary array for(my $site = 0; $site < length($$con); ++$site) { $i_site = substr($$con,$site,1); # if site shows indel variation if($i_site =~ /a|c|t|g|r|y|m|k|s|w|h|b|v|d|-/) { $i_code[$site+1] = 1; } else { $i_code[$site+1]= 0; } } my $code = 0; #counts number of sites in indel set my $start = 0; #holds indel start position my $end = 0; #holds indel end position # go through each binary coded sites # starting at '$bp=1' and ending at '$bp <= length($$con)' is correct # as operations within loop act on vector 1 site longer (at start) # than consensus sequence for(my $bp = 1; $bp <= length($$con); ++$bp) { # if present site AND preceding site show no indels, ignore if($i_code[$bp] == 0 && $i_code[$bp-1] == 0) {next} # if present site shows an indel BUT preceding site does not # this is the first indel site in a set, so set the start position if($i_code[$bp] == 1 && $i_code[$bp-1] == 0) { $start = $bp; } # if present site shows an indel irrespective of any other site, # increment $code (indexes number of sites in indel set) if($i_code[$bp] == 1) { ++$code; } # if present site shows no indels BUT preceding site does # then the preceding site was the last indel site in a set if($i_code[$bp] == 0 && $i_code[$bp-1] == 1) { my $id_con = ''; #holds the consensus sequence for a given indel set my $poly_count = 0; #the number of polymorphic sites in an indel set my $bad_id = 0; #set a code to mark a complex indel (1 = complex) my $id_len = 0; #length of indel set my $in_count = 0; #counts the number of insertion alleles my $del_count = 0; #counts the number of deletion alleles my $seq_id = ''; #for 1bp indel: the base over all sequences as a string #for 2+bp indel: the string of bases a the indel for a given sequence my $output_poly = ''; #string will hold whether each input seq is I, D, or N for indel #(seq gets an N when ANY base in IN or DEL region is an N) # if the set holds just a single site (ie. 1bp indel) if($code == 1) { $id_len = 1; #set length of indel $end = $bp-1; # a 1bp indel is simple only if 1nt segregates with the '-' # it is complex if 2+nt segregate $id_con = substr($$con,($start-1),($end-$start+1)); $poly_count = ($id_con =~ tr/rymkswhbvd//); #num poly sites in indel set if($poly_count == 0) { $bad_id = 0; # test each sequence for the base at the 1bp indel; # count the number of '-', and the number of non-N # insertion alleles # simultaneously add D, I, or N, to a string to show the # allele present in each sequence for(my $sseq = 0; $sseq < scalar @$seqs; ++$sseq) { $seq_id = substr(@$seqs[$sseq],$start-1,1); if($seq_id =~ tr/-//) { ++$del_count; $output_poly .= "D\t"; } elsif($seq_id =~ tr/N//) { $output_poly .= "N\t"; } else { ++$in_count; $output_poly .= "I\t"; } } } else { $bad_id = 1; } } # otherwise, if the indel set is at least 2 sites if($code > 1) { $end = $bp-1; # ONE # first way a 2+bp indel set can be complex is if any site segregates # more than 1 non '-' allele $id_con = substr($$con,($start-1),($end-$start+1)); $poly_count = ($id_con =~ tr/rymkswhbvd//); #num poly sites in indel set if($poly_count == 0) {$bad_id = 0} #if zero, indel MAY be good #perform second test else {$bad_id = 1} #if not, indel IS complex # TWO # second way a 2+bp indel set can be complex is if the number of '-' # for any sequence at the indel is not EITHER equal to the length # of the indel, OR equal to zero (eg. a polymorphic microsat may not # show this pattern) # N's not included in indel length for those sequences possessing them $id_len = $end-($start-1); #indel length my $num_hyphen = 0; #holds number of '-' at indel for given sequence my $num_missing = 0; #holds the number of N's at an indel for given sequence for(my $sseq = 0; $sseq < scalar @$seqs; ++$sseq) { $seq_id = substr(@$seqs[$sseq],$start-1,($end-$start+1)); $num_hyphen = ($seq_id =~ tr/-//); $num_missing = ($seq_id =~ tr/N//); # if NO hyphens, or ALL hyphens (ignoring N's), indel is good # otherwise indel is bad if($num_hyphen == 0) { if($num_missing == 0) { ++$in_count; #increment number ins if no missing data only $output_poly .= "I\t"; #output an I for this sequence } if($num_missing > 0) { $output_poly .= "N\t"; #seq has N in inserted seq - output an N } } elsif($num_hyphen == ($id_len - $num_missing)) { if($num_missing == 0) { ++$del_count; #increment number dels if no missing data only $output_poly .= "D\t"; #output a D for this sequence } if($num_missing > 0) { $output_poly .= "N\t"; #seq has N in deleted seq - output an N } } else {$bad_id = 1} } # RESULT # after these two tests, $bad_id will either be 0 (simple biallelic # indel) or 1 (complex indel) if($bad_id == 0) { # oligos, and calculate GC% for each my $up_del = substr($$con,$start-17,16); my $up_ins = substr($$con,$end-16,16); my $down = substr($$con,$end,16); my $gc_updel = OligoGCpercent($up_del); my $gc_upins = OligoGCpercent($up_ins); my $gc_down = OligoGCpercent($down); # if INSERTION is major allele (IN wins ties also) if($in_count >= $del_count) { my $minor_freq = $del_count/($in_count + $del_count); print ID "bi\t$start\t$id_len\tI\tD\t$in_count\t$del_count\t$minor_freq\t$up_ins\t$up_del\t$down\t$gc_upins\t$gc_updel\t$gc_down\t"; chop $output_poly; print ID "$output_poly\n"; } # if DELETION is major allele else { my $minor_freq = $in_count/($in_count + $del_count); print ID "bi\t$start\t$id_len\tD\tI\t$del_count\t$in_count\t$minor_freq\t$up_del\t$up_ins\t$down\t$gc_updel\t$gc_upins\t$gc_down\t"; chop $output_poly; print ID "$output_poly\n"; } } elsif($bad_id == 1) { print ID "complex\t$start\t$id_len\n"; } } } } close ID; }