Browse Source

Merge pull request #61 from tripal/54-validate_fasta

Fix validate fasta
Lacey-Anne Sanderson 6 years ago
parent
commit
1582cffd36
3 changed files with 459 additions and 242 deletions
  1. 18 9
      api/blast_ui.api.inc
  2. 292 233
      composer.lock
  3. 149 0
      tests/ValidateFastaTest.php

+ 18 - 9
api/blast_ui.api.inc

@@ -347,18 +347,27 @@ function validate_fasta_sequence($type, $sequence) {
   //Includes IUPAC codes.
   $fastaSeqRegEx = ($type == 'nucleotide')
                    ? '/^[ATCGNUKMBVSWDYRHatcgnukmbvswdyrh\[\/\]\s\n\r]*$/'
-                   : '/^[ABCDEFGHIKLMNPQRSTUVWYZXabcdefghiklmnpqrstuvwyzx\*\-\s\n\r]*$/';
-  $defRegEx      = '/^>.*(\\n|\\r)(.*)$/sm';
-  if (preg_match($defRegEx, $sequence, $matches)) {
-    if (isset($matches[2]) && $matches[2] != '' && preg_match($fastaSeqRegEx, $matches[2])) {
-      return true;
+                   : '/^[acgturykmswbdhvnxACGTURYKMSWBDHVNX\*\-\s\n\r]*$/';
+  $defRegEx      = '/^>\S.*/';
+
+  // For each line of the sequence.
+  foreach (explode("\n", $sequence) as $line) {
+
+    // Is this a definition line?
+    if ($line[0] == '>') {
+      if (!preg_match($defRegEx, $line)) {
+        return FALSE;
+      }
+    }
+    // Otherwise it's a sequence line
+    else {
+      if (!preg_match($fastaSeqRegEx, $line)) {
+        return FALSE;
+      }
     }
-  }
-  else if ($sequence != '' && preg_match($defRegEx, $sequence)) {
-    return true;
   }
 
-  return false;
+  return TRUE;
 }
 
 /**

File diff suppressed because it is too large
+ 292 - 233
composer.lock


+ 149 - 0
tests/ValidateFastaTest.php

@@ -0,0 +1,149 @@
+<?php
+namespace Tests;
+
+use StatonLab\TripalTestSuite\DBTransaction;
+use StatonLab\TripalTestSuite\TripalTestCase;
+
+class ValidateFastaTest extends TripalTestCase {
+  // Uncomment to auto start and rollback db transactions per test method.
+  // use DBTransaction;
+
+  /**
+   * Test validate_fasta_sequence() detects invalid sequence.
+   *
+   * Runs once for every dataset returned by provideInvalidFasta() and
+   * asserts that validate_fasta_sequence() returns FALSE for it.
+   *
+   * @dataProvider provideInvalidFasta
+   *
+   * @param string $type
+   *   Sequence type handed to the validator; the datasets use 'nucleotide'
+   *   and 'protein'.
+   * @param string $sequence
+   *   The FASTA text (definition lines plus sequence lines) to validate.
+   */
+  public function testInvalidFasta($type, $sequence) {
+    $this->assertFalse(validate_fasta_sequence($type, $sequence),
+      "Failed to detect invalid FASTA sequence.");
+  }
+
+  /**
+   * Test validate_fasta_sequence() works for valid sequence.
+   *
+   * Runs once for every dataset returned by provideValidFasta() and
+   * asserts that validate_fasta_sequence() returns TRUE for it.
+   *
+   * @dataProvider provideValidFasta
+   *
+   * @param string $type
+   *   Sequence type handed to the validator; the datasets use 'nucleotide'
+   *   and 'protein'.
+   * @param string $sequence
+   *   The FASTA text (definition lines plus sequence lines) to validate.
+   */
+  public function testValidFasta($type, $sequence) {
+    $this->assertTrue(validate_fasta_sequence($type, $sequence),
+      "Unable to pass valid fasta sequence");
+  }
+
+  /**
+   * Provide invalid fasta entries to test.
+   *
+   * For each of 'nucleotide' and 'protein' there are four datasets:
+   *   - a valid first record followed by a record whose definition line has
+   *     a space directly after '>',
+   *   - a sequence containing a run of 'Z',
+   *   - a sequence containing punctuation and other symbols,
+   *   - a sequence containing digits.
+   *
+   * The string literals start in column 0 and span several lines. Every
+   * character between the quotes, line breaks included, is part of the
+   * FASTA text, so the literals must not be re-indented.
+   *
+   * NOTE(review): the protein "incorrect characters" dataset relies on 'Z'
+   * being rejected, and every protein fixture in this class is built only
+   * from the letters ACGTURYKMSWBDHVNX. That looks like the nucleotide
+   * ambiguity-code alphabet rather than amino-acid letters; confirm this is
+   * the intended protein alphabet.
+   *
+   * @return array
+   *   List of datasets, each an array of [type, sequence] matching the
+   *   parameters of testInvalidFasta().
+   */
+  public function provideInvalidFasta() {
+    return [
+      ['nucleotide',
+'>good first record
+ATCGACTAGCTACGATCGACTAGCAGTCAGTACTGACGTACTACGATCGACTAGCATGCT
+GCATCGATCGATCGACTATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGACT
+> space before definition
+ATTTGGGTGGTTGGGGTTCCCCCACACCGTGGGGTTTGGCAAAACCCGTGGGGCCACACA
+CACACCCCTGGGGTGTGACCCGTAACGCGAATATGCGCGATATTACGGCGCGCGATATTA'
+      ],
+      ['nucleotide',
+'>Has incorrect characters
+ACGTGCATGCATCGACACTACGACTACGACTACTAGCTCGACTGATCGACATCGATCGAT
+ACGTCAGCTACGGCGCGGCATGATCGAZZZZZZZZZZZZZZZZZZTCGCAACCCCTTTCA'
+      ],
+      ['nucleotide',
+'>Has special characters
+AACGTACGATCGACTAGCTACGATCGATCGACTAGCTAGCATCGATCGATCGATCGATCG
+AATTGTCAACGT%%#^&^&%%$$#@@!#%^&*()@ACACGTAGCTAGGCCAACGTGCAAA'
+      ],
+      ['nucleotide',
+'>Has numbers
+ACGTACGATCGATCGACTAGCTAGCATCGATCGATCGATCAGCATCGATCGATCGACTCG
+ACGTACGATCGATCGA2425345345ACGATCGACTAGCTAGCATCGATCGATCAGTCAG'
+      ],
+      ['protein', 
+'>good first record
+ACGTURYKMSWBDACGTURYKMSWBDHACGTURYKMSWBDHVNXACGTURYKMSWBDHVNXVNXHVNX
+ACGTURYKMSWBDHVNACGTURYKMSWBDHVNACGTURYKMSWBDHVACGTURYKMSWBDHVNXNXXX
+> space before identifier
+ACGTURYKMSWBDHVNACGTURYKMSWBDHVNACGTURYKMSWBDHVACGTURYKMSWBDHVNXNXXX
+ACGTURYKMSWBDHVNACGTACGTURYKMACGTURYKMSWBDHVNXSWBDHVNXURYKMSWBDHVNXX'
+      ],
+      ['protein', 
+'>Has incorrect characters
+ACGTURYKMSWBDACGTURYKMSWBDHVNACGTURYKMSWBDHVNACGTURYKMSWBDHVNXXXHVNX
+ACGTURYKMSWBDHVACGTURYKMSWBZZZZZZZZZZZZZZZZZZDHVNACGTURYKMSWBDHVNXNZ'
+      ],
+      ['protein', 
+'>Has special characters
+ACGTURYKMSACGTURYKMSWBDHVNACGTURYKMSWBDHVNACGTURYKMSWBDHVNXXXWBDHVNX
+ACGTURYKMSWBACGTURYKMSWB%#$%#$%#$^$%^%&^%&*&(*)()(!!#@#$#^DHVNXDHVNX'
+      ],
+      ['protein', 
+'>Has numbers
+ACGTURYKMSWBDHVACGTURYKMSWBDHVNACGTURYKMSWBDHVNACGTURYKMSWBDHVNXXXNX
+ACGTURYKMSWACGTURYKMSWB13212324ACGTURYKMSWBDHVNX324443556DHVNXBDHVNX'
+      ],
+    ];
+  }
+
+  /**
+   * Provide Valid FASTA entries to test.
+   *
+   * For each of 'nucleotide' and 'protein' there are five datasets:
+   *   - a single record with an identifier and a description,
+   *   - a single record with an identifier only,
+   *   - two records in one string (multi-FASTA),
+   *   - a record with spaces inside its sequence lines (the protein one
+   *     also has a sequence line that begins with a space),
+   *   - a "masked" record written mostly in lower case with some upper-case
+   *     letters mixed in.
+   *
+   * The string literals start in column 0 and span several lines. Every
+   * character between the quotes, line breaks included, is part of the
+   * FASTA text, so the literals must not be re-indented.
+   *
+   * @return array
+   *   List of datasets, each an array of [type, sequence] matching the
+   *   parameters of testValidFasta().
+   */
+  public function provideValidFasta() {
+    return [
+      ['nucleotide',
+'>identifier description
+ACGACTGTACGCGAGCTACGTACGTAGCATGCATCGATCGATCGATCGATCGATCGATC
+AGCTAGTCAGCATCGATGCATGCATCGACGTAATCGAGCGTAGCGAGCTAGTCATACGT'
+      ],
+      ['nucleotide',
+'>identifier
+ACGTACGATCGATCGATCGATCGATCGATCGACATGCTACGATCGATCGATCGATCGG'
+      ],
+      ['nucleotide',
+'>mutlifasta description
+AGCTAGCATCGATCGATCAGCTAGCATCGATCGATCGACTAGCTAGCATCGATCGATC
+CGATCGATCAGCTAGCTACGATCGATCGATCGATCGACTAGCTACGATCGATCGATCG
+CGATCGACTAGCGTACGATCGATCGATCGATCGATCGATCGATCACGATCAGCTACGT
+>multifasta description
+GCTACGATCGATCAGCTATCGACTATCGACGTATCGATGGAGTCATGCAGTCATGCAG
+ACGCTACGATCAGCGTACGATCGATCGATCGATCGATCGATCGATCGATCGGTGCGTC'
+      ],
+      ['nucleotide',
+'>with spaces in sequence
+ACGATCGATCGATCGATAGCTACGATCGATCGATCGATCAGCTAGCTACGATCGATCA
+ACAGCATCAGCTAG ACGATCAGCTAGCTACGATCGATCG  ACTGATCAGCATGCAT
+AACGTACGATCAGCTAGCATGCAT   ATCGATCAGCTAGCTACGATCGATCGATCAG'
+      ],
+      ['nucleotide',
+'>masked sequence
+acgatcgatcgactagctacgatcgacatcgatcatgAACGTGTGGGGTGTGTGCAa
+cagctagctagcatcgatcgatcgatcagctagcatgctacgagtcagcatcgtgca'
+      ],
+      ['protein',
+'>identier description
+ACGTURYKMSWBDHACGTURYKACGTURYKMSWBACGTURYKNXMSWBDHVNXVNX
+YKMSWACGTURYKMSWBDHVNXBDHVNXACGTURYACGTURYKMSWBDSWBDHATK'
+      ],
+      ['protein',
+'>identifier
+ACGTURYKMSWBACGTURYKMSWBDHVNXDACGTURYKMSWBDHVNXHVACGTURY
+ACGTURYKMSWBDHVNACGTURYKMSWBDACGTURYKMSWBDHVNXHVACGTURXX'
+      ],
+      ['protein',
+'>multifasta description
+ACGTURYKMSWBDHVNACGTURYKMSWBDHVNACGTURYKMSWBDHACGTURYKMSWBDHVNXVNXXX
+ACGTURYKMSWBDACGTURYKMSWBACGTURYKMSWBDHACGTURYKMSWBDHVNXVNXDHVNXHVNX
+>multifasta description
+ACGTURYKMSWBACGTURYKMSWBDHVNACGTURYKMSWBDHVACGTURYKMSWBDHVNXNXXDHVNX
+ACGTURYKMSWBDHVNACGTURYKMSACGTURYKMSWBDACGTURYKMSWBDHVNXHVNXWBDHVNXX'
+      ],
+      ['protein',
+'>with spaces in sequence
+ACGTURYKMSWBDACGTURY KMSWBDHACGTURYKMSWBDH VACGTURYKMSW   BDHVNXNXVNXHVNX
+ ACACGTURYKMSWBDHVGTURY  K MSWBDHVNX ACGTURYKMSWBDH  VACGTURYKMSWBDHVNXNX'
+      ],
+      ['protein',
+'>masked sequence
+acgturykmswbdhvacgturykmswbdhvacgturykmswbdhvnacgturykmswbdhAXNvnxxnxnx
+acgturykmswbacgturykmswbdhvnxacgtuTTUrykmswbdhvnxdacgturykmswbdhvnxhvnx'
+      ]
+    ];
+  } 
+}

Some files were not shown because too many files changed in this diff