Fasta_reader.pm 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. #!/usr/local/bin/perl -w
  2. # lightweight fasta reader capabilities:
  3. package Fasta_reader;
  4. use strict;
  5. use warnings;
  6. use Carp;
  7. sub new {
  8. my ($packagename, $fastaFile) = @_;
  9. ## note: fastaFile can be a filename or an IO::Handle
  10. my $self = { fastaFile => undef,,
  11. fileHandle => undef };
  12. bless ($self, $packagename);
  13. ## create filehandle
  14. my $filehandle = undef;
  15. if (ref $fastaFile eq 'IO::Handle') {
  16. $filehandle = $fastaFile;
  17. }
  18. else {
  19. if ($fastaFile =~ /\.gz$/) {
  20. open ($filehandle, "gunzip -c $fastaFile | ") or confess "Error, cannot open file $fastaFile using 'gunzip -c'";
  21. }
  22. else {
  23. open ($filehandle, $fastaFile) or die "Error: Couldn't open $fastaFile\n";
  24. }
  25. $self->{fastaFile} = $fastaFile;
  26. }
  27. $self->{fileHandle} = $filehandle;
  28. return ($self);
  29. }
  30. #### next() fetches next Sequence object.
  31. sub next {
  32. my $self = shift;
  33. my $orig_record_sep = $/;
  34. $/="\n>";
  35. my $filehandle = $self->{fileHandle};
  36. my $next_text_input = <$filehandle>;
  37. if (defined($next_text_input) && $next_text_input !~ /\w/) {
  38. ## must have been some whitespace at start of fasta file, before first entry.
  39. ## try again:
  40. $next_text_input = <$filehandle>;
  41. }
  42. my $seqobj = undef;
  43. if ($next_text_input) {
  44. $next_text_input =~ s/^>|>$//g; #remove trailing > char.
  45. $next_text_input =~ tr/\t\n\000-\037\177-\377/\t\n/d; #remove cntrl chars
  46. my ($header, @seqlines) = split (/\n/, $next_text_input);
  47. my $sequence = join ("", @seqlines);
  48. $sequence =~ s/\s//g;
  49. $seqobj = Sequence->new($header, $sequence);
  50. }
  51. $/ = $orig_record_sep; #reset the record separator to original setting.
  52. return ($seqobj); #returns null if not instantiated.
  53. }
  54. #### finish() closes the open filehandle to the query database.
  55. sub finish {
  56. my $self = shift;
  57. my $filehandle = $self->{fileHandle};
  58. close $filehandle;
  59. $self->{fileHandle} = undef;
  60. }
  61. ####
  62. sub retrieve_all_seqs_hash {
  63. my $self = shift;
  64. my %acc_to_seq;
  65. while (my $seq_obj = $self->next()) {
  66. my $acc = $seq_obj->get_accession();
  67. my $sequence = $seq_obj->get_sequence();
  68. $acc_to_seq{$acc} = $sequence;
  69. }
  70. return(%acc_to_seq);
  71. }
  72. ##############################################
  73. package Sequence;
  74. use strict;
  75. sub new {
  76. my ($packagename, $header, $sequence) = @_;
  77. ## extract an accession from the header:
  78. my ($acc, $rest) = split (/\s+/, $header, 2);
  79. my $self = { accession => $acc,
  80. header => $header,
  81. sequence => $sequence,
  82. filename => undef };
  83. bless ($self, $packagename);
  84. return ($self);
  85. }
  86. ####
  87. sub get_accession {
  88. my $self = shift;
  89. return ($self->{accession});
  90. }
  91. ####
  92. sub get_header {
  93. my $self = shift;
  94. return ($self->{header});
  95. }
  96. ####
  97. sub get_sequence {
  98. my $self = shift;
  99. return ($self->{sequence});
  100. }
  101. ####
  102. sub get_FASTA_format {
  103. my $self = shift;
  104. my %settings = @_;
  105. my $fasta_line_len = $settings{fasta_line_len} || 60;
  106. my $header = $self->get_header();
  107. my $sequence = $self->get_sequence();
  108. if ($fasta_line_len > 0) {
  109. $sequence =~ s/(\S{$fasta_line_len})/$1\n/g;
  110. chomp $sequence;
  111. }
  112. my $fasta_entry = ">$header\n$sequence\n";
  113. return ($fasta_entry);
  114. }
  115. ####
  116. sub write_fasta_file {
  117. my $self = shift;
  118. my $filename = shift;
  119. my ($accession, $header, $sequence) = ($self->{accession}, $self->{header}, $self->{sequence});
  120. my $fasta_entry = $self->get_FASTA_format();
  121. my $tempfile;
  122. if ($filename) {
  123. $tempfile = $filename;
  124. } else {
  125. my $acc = $accession;
  126. $acc =~ s/\W/_/g;
  127. $tempfile = "$acc.fasta";
  128. }
  129. open (TMP, ">$tempfile") or die "ERROR! Couldn't write a temporary file in current directory.\n";
  130. print TMP $fasta_entry;
  131. close TMP;
  132. return ($tempfile);
  133. }
  134. ####
  135. sub get_core_read_name {
  136. my $self = shift;
  137. my $acc = $self->get_accession();
  138. $acc =~ s|/[12]$||;
  139. return($acc);
  140. }
  141. 1; #EOM