abundance_estimates_to_matrix.pl 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
  1. #!/usr/bin/env perl
  2. use strict;
  3. use warnings;
  4. use FindBin;
  5. use File::Basename;
  6. use Getopt::Long qw(:config no_ignore_case bundling pass_through);
  7. my $usage = <<__EOUSAGE__;
  8. ############################################################
  9. #
  10. # Usage: $0 --est_method <method> sample1.results sample2.results ...
  11. #
  12. # or $0 --est_method <method> --quant_files file.listing_target_files.txt
  13. #
  14. # Note, if only a single input file is given, it's expected to contain the paths to all the target abundance estimation files.
  15. #
  16. # Required:
  17. #
  18. # --est_method <string> featureCounts|RSEM|eXpress|kallisto|salmon (needs to know what format to expect)
  19. #
  20. # Options:
  21. #
  22. # --cross_sample_norm <string> TMM|UpperQuartile|none (default: TMM)
  23. #
  24. # --name_sample_by_basedir name sample column by dirname instead of filename
  25. # --basedir_index <int> default(-2)
  26. #
  27. # --out_prefix <string> default: 'matrix'
  28. #
  29. # --quant_files <string> file containing a list of all the target files.
  30. #
  31. ############################################################
  32. __EOUSAGE__
  33. ;
  34. my $help_flag;
  35. my $est_method;
  36. my $val_type;
  37. my $cross_sample_norm = "TMM";
  38. my $name_sample_by_basedir = 0;
  39. my $out_prefix = "matrix";
  40. my $basedir_index = -2;
  41. my $quant_files = "";
  42. &GetOptions('help|h' => \$help_flag,
  43. 'est_method=s' => \$est_method,
  44. 'cross_sample_norm=s' => \$cross_sample_norm,
  45. 'name_sample_by_basedir' => \$name_sample_by_basedir,
  46. 'out_prefix=s' => \$out_prefix,
  47. 'basedir_index=i' => \$basedir_index,
  48. 'quant_files=s' => \$quant_files,
  49. );
  50. unless ($est_method && (@ARGV || $quant_files)) {
  51. die $usage;
  52. }
  53. unless ($est_method =~ /^(featureCounts|RSEM|eXpress|kallisto|salmon)/i) {
  54. die "Error, dont recognize --est_method $est_method ";
  55. }
  56. unless ($cross_sample_norm =~ /^(TMM|UpperQuartile|none)$/i) {
  57. die "Error, dont recognize --cross_sample_norm $cross_sample_norm ";
  58. }
  59. my @files;
  60. if ($quant_files) {
  61. # allow for a file listing the various files.
  62. @files = `cat $quant_files`;
  63. chomp @files;
  64. }
  65. elsif (@ARGV) {
  66. @files = @ARGV;
  67. }
  68. else {
  69. die $usage;
  70. }
  71. =data_formats
  72. ## featureCounts
  73. 0 gene_id
  74. 1 counts
  75. 2 fpkm
  76. 3 tpm
  77. 4 cpm
  78. ## RSEM:
  79. 0 transcript_id
  80. 1 gene_id
  81. 2 length
  82. 3 effective_length
  83. 4 expected_count
  84. 5 TPM
  85. 6 FPKM
  86. 7 IsoPct
  87. ## eXpress v1.5:
  88. 1 target_id
  89. 2 length
  90. 3 eff_length
  91. 4 tot_counts
  92. 5 uniq_counts
  93. 6 est_counts
  94. 7 eff_counts
  95. 8 ambig_distr_alpha
  96. 9 ambig_distr_beta
  97. 10 fpkm
  98. 11 fpkm_conf_low
  99. 12 fpkm_conf_high
  100. 13 solvable
  101. 14 tpm
  102. ## kallisto:
  103. 0 target_id
  104. 1 length
  105. 2 eff_length
  106. 3 est_counts
  107. 4 tpm
  108. ## salmon:
  109. 0 Name
  110. 1 Length
  111. 2 EffectiveLength
  112. 3 TPM
  113. 4 NumReads
  114. =cut
  115. ;
  116. my ($acc_field, $counts_field, $fpkm_field, $tpm_field, $cpm_field);
  117. if ($est_method =~ /^rsem$/i) {
  118. $acc_field = 0;
  119. $counts_field = "expected_count";
  120. $fpkm_field = "FPKM";
  121. $tpm_field = "TPM";
  122. }
  123. elsif ($est_method =~ /^express$/i) { # as of v1.5
  124. $acc_field = "target_id";
  125. $counts_field = "eff_counts";
  126. $fpkm_field = "fpkm";
  127. $tpm_field = "tpm";
  128. }
  129. elsif ($est_method =~ /^kallisto$/i) {
  130. $acc_field = "target_id";
  131. $counts_field = "est_counts";
  132. $fpkm_field = "tpm";
  133. $tpm_field = "tpm";
  134. }
  135. elsif ($est_method =~ /^salmon/) {
  136. $acc_field = "Name";
  137. $counts_field = "NumReads";
  138. $fpkm_field = "TPM";
  139. $tpm_field = "TPM";
  140. }
  141. elsif ($est_method =~ /^featureCounts/) {
  142. $acc_field = "gene_id";
  143. $counts_field = "counts";
  144. $fpkm_field = "fpkm";
  145. $tpm_field = "tpm";
  146. $cpm_field = "cpm";
  147. }
  148. else {
  149. die "Error, dont recognize --est_method [$est_method] ";
  150. }
  151. main: {
  152. my %data;
  153. foreach my $file (@files) {
  154. print STDERR "-reading file: $file\n";
  155. open (my $fh, $file) or die "Error, cannot open file $file";
  156. my $header = <$fh>;
  157. chomp $header;
  158. my %fields = &parse_field_positions($header);
  159. #use Data::Dumper; print STDERR Dumper(\%fields);
  160. while (<$fh>) {
  161. chomp;
  162. my @x = split(/\t/);
  163. my $acc = $x[ $fields{$acc_field} ];
  164. my $count = $x[ $fields{$counts_field} ];
  165. my $fpkm = $x[ $fields{$fpkm_field} ];
  166. my $tpm = $x[ $fields{$tpm_field} ];
  167. my $cpm = $x[ $fields{$cpm_field} ];
  168. $data{$acc}->{$file}->{count} = $count;
  169. $data{$acc}->{$file}->{FPKM} = $fpkm;
  170. $data{$acc}->{$file}->{TPM} = $tpm;
  171. $data{$acc}->{$file}->{CPM} = $cpm;
  172. }
  173. close $fh;
  174. }
  175. my @filenames = @files;
  176. foreach my $file (@filenames) {
  177. if ($name_sample_by_basedir) {
  178. my @path = split(m|/|, $file);
  179. $file = $path[$basedir_index];
  180. }
  181. else {
  182. $file = basename($file);
  183. }
  184. }
  185. print STDERR "\n\n* Outputting combined matrix.\n\n";
  186. my $counts_matrix_file = "$out_prefix.counts.matrix";
  187. my $TPM_matrix_file = "$out_prefix.TPM.not_cross_norm";
  188. my $CPM_matrix_file = "$out_prefix.CPM.not_cross_norm";
  189. open (my $ofh_counts, ">$counts_matrix_file") or die "Error, cannot write file $counts_matrix_file";
  190. open (my $ofh_TPM, ">$TPM_matrix_file") or die "Error, cannot write file $TPM_matrix_file";
  191. open (my $ofh_CPM, ">$CPM_matrix_file") or die "Error, cannot write file $CPM_matrix_file";
  192. { # check to see if they're unique
  193. my %filename_map = map { + $_ => 1 } @filenames;
  194. if (scalar keys %filename_map != scalar @filenames) {
  195. die "Error, the column headings: @filenames are not unique. Should you consider using the --name_sample_by_basedir parameter?";
  196. }
  197. }
  198. # clean up matrix headers
  199. foreach my $file (@filenames) {
  200. # also, get rid of the part of the filename that RSEM adds
  201. $file =~ s/\.(genes|isoforms)\.results$//;
  202. # featureCounts
  203. $file =~ s/\.count$//;
  204. }
  205. print $ofh_counts join("\t", "", @filenames) . "\n";
  206. print $ofh_TPM join("\t", "", @filenames) . "\n";
  207. print $ofh_CPM join("\t", "", @filenames) . "\n";
  208. foreach my $acc (keys %data) {
  209. print $ofh_counts "$acc";
  210. print $ofh_TPM "$acc";
  211. print $ofh_CPM "$acc";
  212. foreach my $file (@files) {
  213. my $count = $data{$acc}->{$file}->{count};
  214. unless (defined $count) {
  215. $count = "NA";
  216. }
  217. my $tpm = $data{$acc}->{$file}->{TPM};
  218. if (defined $tpm) {
  219. $tpm = $tpm/1;
  220. }
  221. else {
  222. $tpm = "NA";
  223. }
  224. my $cpm = $data{$acc}->{$file}->{CPM};
  225. if (defined $cpm) {
  226. $cpm = $cpm/1;
  227. }
  228. else {
  229. $cpm = "NA";
  230. }
  231. print $ofh_counts "\t$count";
  232. print $ofh_TPM "\t$tpm";
  233. print $ofh_CPM "\t$cpm";
  234. }
  235. print $ofh_counts "\n";
  236. print $ofh_TPM "\n";
  237. print $ofh_CPM "\n";
  238. }
  239. close $ofh_counts;
  240. close $ofh_TPM;
  241. close $ofh_CPM;
  242. if (scalar @files > 1) {
  243. ## more than one sample
  244. if ($cross_sample_norm =~ /^TMM$/i) {
  245. my $cmd = "$FindBin::RealBin/support_scripts/run_TMM_scale_matrix.pl --matrix $TPM_matrix_file > $out_prefix.$cross_sample_norm.TPM.matrix";
  246. &process_cmd($cmd);
  247. $cmd = "$FindBin::RealBin/support_scripts/run_TMM_scale_matrix.pl --matrix $CPM_matrix_file > $out_prefix.$cross_sample_norm.CPM.matrix";
  248. &process_cmd($cmd);
  249. }
  250. elsif ($cross_sample_norm =~ /^UpperQuartile$/) {
  251. my $cmd = "$FindBin::RealBin/support_scripts/run_UpperQuartileNormalization_matrix.pl --matrix $TPM_matrix_file > $out_prefix.$cross_sample_norm.TPM.matrix";
  252. &process_cmd($cmd);
  253. $cmd = "$FindBin::RealBin/support_scripts/run_UpperQuartileNormalization_matrix.pl --matrix $CPM_matrix_file > $out_prefix.$cross_sample_norm.CPM.matrix";
  254. &process_cmd($cmd);
  255. }
  256. elsif ($cross_sample_norm =~ /^none$/i) {
  257. print STDERR "-not performing cross-sample normalization.\n";
  258. }
  259. }
  260. else {
  261. unless (scalar @files == 1) {
  262. die "Error, no target samples. Shouldn't get here.";
  263. }
  264. print STDERR "Warning, only one sample, so not performing cross-sample normalization\n";
  265. }
  266. print STDERR "Done.\n\n";
  267. exit(0);
  268. }
  269. ####
  270. sub process_cmd {
  271. my ($cmd) = @_;
  272. print STDERR $cmd;
  273. my $ret = system($cmd);
  274. if ($ret) {
  275. die "Error, CMD: $cmd died with ret $ret";
  276. }
  277. return;
  278. }
  279. ####
  280. sub parse_field_positions {
  281. my ($header) = @_;
  282. my %field_pos;
  283. my @fields = split(/\s+/, $header);
  284. for (my $i = 0; $i <= $#fields; $i++) {
  285. $field_pos{$fields[$i]} = $i;
  286. $field_pos{$i} = $i; # for fixed column assignment
  287. }
  288. return(%field_pos);
  289. }