CorsetStats.pl 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. #!/usr/bin/env perl
  2. use strict;
  3. use warnings;
  4. use FindBin;
  5. use lib ("$FindBin::RealBin/../PerlLib");
  6. use Fasta_reader;
  7. use BHStats;
  8. my $usage = "\n\nusage: $0 corset.fasta\n\n";
  9. my $fasta_file = $ARGV[0] or die $usage;
  10. main: {
  11. my $fasta_reader = new Fasta_reader($fasta_file);
  12. my @all_seq_lengths;
  13. my $number_transcripts = 0;
  14. my $num_GC = 0;
  15. my %component_to_longest_isoform;
  16. my $tot_seq_len = 0;
  17. my $missing_gene_ids_flag = 0;
  18. while (my $seq_obj = $fasta_reader->next()) {
  19. my $acc = $seq_obj->get_accession();
  20. $number_transcripts++;
  21. my $comp_id = $acc;
  22. if (! $missing_gene_ids_flag) {
  23. if ($acc =~ /^(Cluster-\d+.\d+)_/) {
  24. $comp_id = $1;
  25. }
  26. else {
  27. print STDERR "Error, cannot decipher gene identifier from acc: $acc";
  28. $missing_gene_ids_flag = 1;
  29. }
  30. }
  31. my $sequence = $seq_obj->get_sequence();
  32. my $seq_len = length($sequence);
  33. $tot_seq_len += $seq_len;
  34. if ( (! exists $component_to_longest_isoform{$comp_id})
  35. ||
  36. $component_to_longest_isoform{$comp_id} < $seq_len) {
  37. $component_to_longest_isoform{$comp_id} = $seq_len;
  38. }
  39. push (@all_seq_lengths, $seq_len);
  40. while ($sequence =~ /[gc]/ig) {
  41. $num_GC++;
  42. }
  43. }
  44. print "\n\n";
  45. print "################################\n";
  46. print "## Counts of transcripts, etc.\n";
  47. print "################################\n";
  48. print "Total corset 'clusters':\t" . scalar(keys %component_to_longest_isoform) . "\n";
  49. print "Total trinity transcripts:\t" . $number_transcripts . "\n";
  50. my $pct_gc = sprintf("%.2f", $num_GC / $tot_seq_len * 100);
  51. print "Percent GC: $pct_gc\n\n";
  52. print "########################################\n";
  53. print "Stats based on ALL transcript contigs:\n";
  54. print "########################################\n\n";
  55. &report_stats(@all_seq_lengths);
  56. print "\n\n";
  57. if ($missing_gene_ids_flag) {
  58. print " - note: not reporting gene-based longest isoform info since couldn't parse Trinity accession info.\n";
  59. }
  60. else {
  61. print "#####################################################\n";
  62. print "## Stats based on ONLY LONGEST ISOFORM per 'GENE':\n";
  63. print "#####################################################\n\n";
  64. &report_stats(values %component_to_longest_isoform);
  65. print "\n\n\n";
  66. }
  67. exit(0);
  68. }
  69. ####
  70. sub report_stats {
  71. my (@seq_lengths) = @_;
  72. @seq_lengths = reverse sort {$a<=>$b} @seq_lengths;
  73. my $cum_seq_len = 0;
  74. foreach my $len (@seq_lengths) {
  75. $cum_seq_len += $len;
  76. }
  77. for (my $i = 10; $i <= 50; $i += 10) {
  78. my $cum_len_needed = $cum_seq_len * $i/100;
  79. my $N_val = &get_contigNvalue($cum_len_needed, \@seq_lengths);
  80. print "\tContig N$i: $N_val\n";
  81. }
  82. print "\n";
  83. my $median_len = &BHStats::median(@seq_lengths);
  84. print "\tMedian contig length: $median_len\n";
  85. my $avg_len = sprintf("%.2f", &BHStats::avg(@seq_lengths));
  86. print "\tAverage contig: $avg_len\n";
  87. print "\tTotal assembled bases: $cum_seq_len\n";
  88. return;
  89. }
  90. sub get_contigNvalue {
  91. my ($cum_len_needed, $seq_lengths_aref) = @_;
  92. my $partial_sum_len = 0;
  93. foreach my $len (@$seq_lengths_aref) {
  94. $partial_sum_len += $len;
  95. if ($partial_sum_len >= $cum_len_needed) {
  96. return($len);
  97. }
  98. }
  99. return -1; # shouldn't happen.
  100. }