Browse Source

Uncommented FASTA loading

Stephen Ficklin 4 years ago
parent
commit
935339fe50
1 changed files with 10 additions and 91 deletions
  1. 10 91
      tripal_chado/includes/TripalImporter/GFF3Importer.inc

+ 10 - 91
tripal_chado/includes/TripalImporter/GFF3Importer.inc

@@ -613,25 +613,25 @@ class GFF3Importer extends TripalImporter {
     $this->logMessage("Step 15: Loading feature cross references...              ");
     $this->insertFeatureDbxrefs();
 
-    $this->logMessage("Step 16: Loading feature ontology terms...                 ");
+    $this->logMessage("Step 16: Loading feature ontology terms...                ");
     $this->insertFeatureCVterms();
 
     $this->logMessage("Step 17: Associate child-parent relationships...          ");
     $this->findChildRanks();
     $this->insertFeatureParents();
 
-    $this->logMessage("Step 18: Loading 'derives_from' relationships...           ");
+    $this->logMessage("Step 18: Loading 'derives_from' relationships...          ");
     $this->insertFeatureDerivesFrom();
 
+    $this->logMessage("Step 19: Loading Targets...                               ");
     // TODO: Target (target_organism & target_type)
 
+    $this->logMessage("Step 20: Adding any missing proteins...                   ");
     // TODO: protein records.
 
-    // TODO: Don't import FASTA into memory, use indexes in files.
-
     if (!empty($this->residue_index)) {
-      $this->logMessage("Step 9: Loading residues from FASTA...                  ");
-      //$this->loadFastas();
+      $this->logMessage("Step 21: Adding sequences if available...                 ");
+      $this->insertFeatureSeqs();
     }
   }
 
@@ -1077,7 +1077,7 @@ class GFF3Importer extends TripalImporter {
   /**
    * Loads the actual residue information from the FASTA section of the file.
    */
-  private function loadFastas() {
+  private function insertFeatureSeqs() {
 
     $num_residues = count(array_keys($this->residue_index));
 
@@ -1125,13 +1125,11 @@ class GFF3Importer extends TripalImporter {
         $this->landmarks[$uniquename] = $feature;
       }
 
-      chado_update_record('feature', array(
-        'feature_id' => $id,
-      ), array(
+      chado_update_record('feature', ['feature_id' => $id], [
         'residues' => $residues,
         'seqlen' => strlen($residues),
         'md5checksum' => md5($residues),
-      ));
+      ]);
 
       $count++;
       $this->setItemsHandled($count);
@@ -1421,7 +1419,7 @@ class GFF3Importer extends TripalImporter {
       // Only do an insert if this feature doesn't already exist in the database.
       if (!$feature['feature_id'] and !$feature['skipped']) {
         $i++;
-        $residues = '';//$this->getResidues($feature, FALSE);
+        $residues = $this->getResidues($feature, FALSE);
         $type_id = $this->feature_cvterm_lookup[strtolower($feature['type'])];
         $sql .= "(:uniquename_$i, :name_$i, :type_id_$i, :organism_id_$i, :residues_$i, " .
                " :md5checksum_$i, :seqlen_$i, FALSE, FALSE),\n";
@@ -2538,85 +2536,6 @@ class GFF3Importer extends TripalImporter {
     return $feature;
   }
 
-  /**
-   * Load the FASTA sequences at the bottom of a GFF3 file
-   *
-   * @param $fh
-   * @param $interval
-   * @param $num_read
-   * @param $line_num
-   * @param $filesize
-   *
-   * @ingroup gff3_loader
-   */
-  private function loadFasta($fh, $interval, &$num_read, &$line_num, $filesize) {
-    $this->logMessage("Loading FASTA sequences...");
-    $residues = '';
-    $id = NULL;
-
-    // iterate through the remaining lines of the file
-    while ($line = fgets($fh)) {
-
-      $line_num++;
-      $size = drupal_strlen($line);
-      $this->addItemsHandled($size);
-      $num_read += $size;
-      $line = trim($line);
-
-      // if we encounter a definition line then get the name, uniquename,
-      // accession and relationship subject from the definition line
-      if (preg_match('/^>/', $line)) {
-
-        // if we are beginning a new sequence then save to the database the last one we just finished.
-        if ($id) {
-          $values = ['uniquename' => $id];
-          $result = chado_select_record('tripal_gff_temp', ['*'], $values);
-          if (count($result) == 0) {
-            $this->logMessage('Cannot find feature to assign FASTA sequence: %uname.',
-              ['%uname' => $id], TRIPAL_WARNING);
-          }
-          else {
-            // if we have a feature then add the residues
-            $feature = $result[0];
-            $values = [
-              'residues' => $residues,
-              'seqlen' => strlen($residues),
-            ];
-            $match = ['feature_id' => $feature->feature_id];
-            chado_update_record('feature', $match, $values);
-          }
-        }
-
-        // get the feature ID for this ID from the tripal_gff_temp table. It
-        // should be the name up to the first space
-        $id = preg_replace('/^>([^\s]+).*$/', '\1', $line);
-        $residues = '';
-      }
-      else {
-        $residues .= trim($line);
-      }
-    }
-
-    // add in the last sequence
-    $values = ['uniquename' => $id];
-    $result = chado_select_record('tripal_gff_temp', ['*'], $values);
-    if (count($result) == 0) {
-      $this->logMessage('Cannot find feature to assign FASTA sequence: %uname.',
-        ['%uname' => $id], TRIPAL_WARNING);
-    }
-    else {
-      // if we have a feature then add the residues
-      $feature = $result[0];
-      $values = [
-        'residues' => $residues,
-        'seqlen' => strlen($residues),
-      ];
-      $match = ['feature_id' => $feature->feature_id];
-      chado_update_record('feature', $match, $values);
-    }
-
-  }
-
   /**
    * Load the target attribute of a gff3 record
    *