#!/usr/bin/env perl use strict; use warnings; use FindBin; use lib ("$FindBin::RealBin/../PerlLib"); use Fasta_reader; use BHStats; my $usage = "\n\nusage: $0 corset.fasta\n\n"; my $fasta_file = $ARGV[0] or die $usage; main: { my $fasta_reader = new Fasta_reader($fasta_file); my @all_seq_lengths; my $number_transcripts = 0; my $num_GC = 0; my %component_to_longest_isoform; my $tot_seq_len = 0; my $missing_gene_ids_flag = 0; while (my $seq_obj = $fasta_reader->next()) { my $acc = $seq_obj->get_accession(); $number_transcripts++; my $comp_id = $acc; if (! $missing_gene_ids_flag) { if ($acc =~ /^(Cluster-\d+.\d+)_/) { $comp_id = $1; } else { print STDERR "Error, cannot decipher gene identifier from acc: $acc"; $missing_gene_ids_flag = 1; } } my $sequence = $seq_obj->get_sequence(); my $seq_len = length($sequence); $tot_seq_len += $seq_len; if ( (! exists $component_to_longest_isoform{$comp_id}) || $component_to_longest_isoform{$comp_id} < $seq_len) { $component_to_longest_isoform{$comp_id} = $seq_len; } push (@all_seq_lengths, $seq_len); while ($sequence =~ /[gc]/ig) { $num_GC++; } } print "\n\n"; print "################################\n"; print "## Counts of transcripts, etc.\n"; print "################################\n"; print "Total corset 'clusters':\t" . scalar(keys %component_to_longest_isoform) . "\n"; print "Total trinity transcripts:\t" . $number_transcripts . "\n"; my $pct_gc = sprintf("%.2f", $num_GC / $tot_seq_len * 100); print "Percent GC: $pct_gc\n\n"; print "########################################\n"; print "Stats based on ALL transcript contigs:\n"; print "########################################\n\n"; &report_stats(@all_seq_lengths); print "\n\n"; if ($missing_gene_ids_flag) { print " - note: not reporting gene-based longest isoform info since couldn't parse Trinity accession info.\n"; } else { print "#####################################################\n"; print "## Stats based on ONLY LONGEST ISOFORM per 'GENE':\n"; print "#####################################################\n\n"; &report_stats(values %component_to_longest_isoform); print "\n\n\n"; } exit(0); } #### sub report_stats { my (@seq_lengths) = @_; @seq_lengths = reverse sort {$a<=>$b} @seq_lengths; my $cum_seq_len = 0; foreach my $len (@seq_lengths) { $cum_seq_len += $len; } for (my $i = 10; $i <= 50; $i += 10) { my $cum_len_needed = $cum_seq_len * $i/100; my $N_val = &get_contigNvalue($cum_len_needed, \@seq_lengths); print "\tContig N$i: $N_val\n"; } print "\n"; my $median_len = &BHStats::median(@seq_lengths); print "\tMedian contig length: $median_len\n"; my $avg_len = sprintf("%.2f", &BHStats::avg(@seq_lengths)); print "\tAverage contig: $avg_len\n"; print "\tTotal assembled bases: $cum_seq_len\n"; return; } sub get_contigNvalue { my ($cum_len_needed, $seq_lengths_aref) = @_; my $partial_sum_len = 0; foreach my $len (@$seq_lengths_aref) { $partial_sum_len += $len; if ($partial_sum_len >= $cum_len_needed) { return($len); } } return -1; # shouldn't happen. }