| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509 | 
							- <?php
 
/**
 * @defgroup fasta_loader FASTA Feature Loader
 * @{
 * Provides FASTA loading functionality. Creates features based on their specification in a FASTA file.
 * @}
 * @ingroup tripal_feature
 */
 
/**
 * Form builder for the FASTA feature loader.
 *
 * Collects the location of the FASTA file, the organism and Sequence Ontology
 * type of its sequences, whether existing features should be updated, and a
 * set of optional "advanced" settings: regular expressions used to pull the
 * name, unique name, external accession and parent name out of each FASTA
 * definition line.
 *
 * The organism and external database lists are read from the chado database
 * each time the form is built.
 *
 * @return
 *   A Drupal Form API array.
 *
 * @ingroup fasta_loader
 */
function tripal_feature_fasta_load_form() {
   $form = array();

   $form['fasta_file'] = array(
      '#type'          => 'textfield',
      '#title'         => t('FASTA File'),
      '#description'   => t('Please enter the full system path for the FASTA file, or a path within the Drupal
                             installation (e.g. /sites/default/files/xyz.obo).  The path must be accessible to the
                             server on which this Drupal instance is running.'),
      '#required'      => TRUE,
      '#weight'        => 1,
   );

   // Build the organism list from chado, then switch back to the Drupal
   // database.  The empty first option forces the user to make a choice.
   $sql = "SELECT * FROM {organism} ORDER BY genus, species";
   $previous_db = tripal_db_set_active('chado');
   $org_rset = db_query($sql);
   tripal_db_set_active($previous_db);
   $organisms = array('' => '');
   while ($organism = db_fetch_object($org_rset)) {
      $organisms[$organism->organism_id] = "$organism->genus $organism->species ($organism->common_name)";
   }
   $form['organism_id'] = array(
      '#title'       => t('Organism'),
      // The element type is a machine name and must never be translated.
      '#type'        => 'select',
      '#description' => t("Choose the organism to which these sequences are associated "),
      '#required'    => TRUE,
      '#options'     => $organisms,
      '#weight'      => 2,
   );

   $form['type'] = array(
      '#type'        => 'textfield',
      '#title'       => t('Sequence Type'),
      '#required'    => TRUE,
      '#description' => t('Please enter the Sequence Ontology term that describes the sequences in the FASTA file.'),
      '#weight'      => 3,
   );

   // The library selector (weight 5) is currently disabled, so this form does
   // not submit a 'library_id' value and the library list is not queried.

   $form['update'] = array(
      '#type'        => 'checkbox',
      '#title'       => t('Insert and update'),
      '#required'    => FALSE,
      '#description' => t('By default only new features are inserted.  Select this checkbox to update
                           features that already exists with the contents from the FASTA file.'),
      '#weight'      => 6,
   );

   // Advanced options.  '#collapsed' only takes effect on a fieldset that is
   // also '#collapsible'.
   $form['advanced'] = array(
      '#type'        => 'fieldset',
      '#title'       => t('Advanced Options'),
      '#weight'      => 7,
      '#collapsible' => TRUE,
      '#collapsed'   => TRUE,
   );
   $form['advanced']['re_help'] = array(
      '#type'   => 'item',
      '#value' => t('A regular expression is an advanced method for extracting information from a string of text.  
                     By default, this loader will use the first word in the definition line for each sequence in the FASTA file
                     as the uniquename for the sequences.  If this is not desired, you may use the following regular 
                     expressions to define the postions of the unique name.'),
      '#weight' => 0,
   );
   $form['advanced']['re_name'] = array(
      '#type'        => 'textfield',
      '#title'       => t('Regular expression for the name'),
      '#required'    => FALSE,
      '#description' => t('Enter the regular expression that will extract the feature name from the FASTA definition line. For example, for a defintion line with a name and uniquename separated by a bar \'|\' (>seqname|uniquename), the regular expression would be, "^(.*?)\|.*$"'),
      '#weight'      => 1,
   );
   $form['advanced']['re_uname'] = array(
      '#type'        => 'textfield',
      '#title'       => t('Regular expression for the unique name'),
      '#required'    => FALSE,
      '#description' => t('Enter the regular expression that will extract the unique feature name for each feature from the FASTA definition line.  This name must be unique for the organism.'),
      '#weight'      => 2,
   );

   // Advanced database cross-reference options.
   $form['advanced']['db'] = array(
      '#type'        => 'fieldset',
      '#title'       => t('External Database Reference'),
      '#weight'      => 6,
      '#collapsible' => TRUE,
      '#collapsed'   => TRUE,
   );
   $form['advanced']['db']['re_accession'] = array(
      '#type'        => 'textfield',
      '#title'       => t('Regular expression for the accession'),
      '#required'    => FALSE,
      '#description' => t('Enter the regular expression that will extract the accession for the external database for each feature from the FASTA definition line.'),
      '#weight'      => 2,
   );

   // Build the list of external databases from chado.
   $sql = "SELECT * FROM {db} ORDER BY name";
   $previous_db = tripal_db_set_active('chado');
   $db_rset = db_query($sql);
   tripal_db_set_active($previous_db);
   $dbs = array('' => '');
   while ($db = db_fetch_object($db_rset)) {
      $dbs[$db->db_id] = "$db->name";
   }
   $form['advanced']['db']['db_id'] = array(
      '#title'       => t('External Database'),
      '#type'        => 'select',
      '#description' => t("Plese choose an external database for which these sequences have a cross reference."),
      '#required'    => FALSE,
      '#options'     => $dbs,
      '#weight'      => 1,
   );

   // Advanced relationship options.
   $form['advanced']['relationship'] = array(
      '#type'        => 'fieldset',
      '#title'       => t('Relationships'),
      '#weight'      => 6,
      '#collapsible' => TRUE,
      '#collapsed'   => TRUE,
   );
   // Keys are the relationship term names that are submitted with the form;
   // values are the labels shown to the user.
   $rels = array(
      ''             => '',
      'part_of'      => 'part of',
      'derives_from' => 'produced by',
   );
   $form['advanced']['relationship']['rel_type'] = array(
      '#title'       => t('Relationship Type'),
      '#type'        => 'select',
      '#description' => t("Use this option to create associations, or relationships between the 
                          features of this FASTA file and existing features in the database. For 
                          example, to associate a FASTA file of peptides to existing genes or transcript sequence, 
                          select the type 'produced by'. For a CDS sequences select the type 'part of'"),
      '#required'    => FALSE,
      '#options'     => $rels,
      '#weight'      => 5,
   );
   $form['advanced']['relationship']['re_subject'] = array(
      '#type'        => 'textfield',
      '#title'       => t('Regular expression for the parent'),
      '#required'    => FALSE,
      '#description' => t('Enter the regular expression that will extract the unique 
                           name needed to identify the existing sequence for which the 
                           relationship type selected above will apply.'),
      '#weight'      => 6,
   );
   $form['advanced']['relationship']['parent_type'] = array(
      '#type'        => 'textfield',
      '#title'       => t('Parent Type'),
      '#required'    => FALSE,
      '#description' => t('Please enter the Sequence Ontology term for the parent.  For example
                           if the FASTA file being loaded is a set of proteins that are 
                           products of genes, then use the SO term \'gene\' or \'transcript\' or equivalent. However,
                           this type must match the type for already loaded features.'),
      '#weight'      => 7,
   );

   $form['button'] = array(
      '#type'   => 'submit',
      '#value'  => t('Import FASTA file'),
      '#weight' => 10,
   );

   return $form;
}
 
/**
 * Validation handler for the FASTA loader form.
 *
 * Checks that:
 *  - the FASTA file can be found, either relative to the Drupal installation
 *    or as an absolute path on the server;
 *  - the relationship settings (type, parent regular expression, parent type)
 *    are either all provided or all empty;
 *  - the external database and the accession regular expression are provided
 *    together;
 *  - the sequence type, and the parent type when a relationship is requested,
 *    exist in the 'sequence' vocabulary (by name or synonym).
 *
 * The resolved file path is stored in $form_state['storage']['dfile'].
 *
 * @param $form
 *   The form array (not used).
 * @param $form_state
 *   The form state; the submitted values are read from
 *   $form_state['values'].
 *
 * @ingroup fasta_loader
 */
function tripal_feature_fasta_load_form_validate($form, &$form_state) {
   $values = $form_state['values'];

   $fasta_file   = trim($values['fasta_file']);
   $type         = trim($values['type']);
   $re_accession = trim($values['re_accession']);
   $db_id        = $values['db_id'];
   $rel_type     = $values['rel_type'];
   $re_subject   = trim($values['re_subject']);
   $parent_type  = trim($values['parent_type']);

   // First look for the file inside the Drupal installation.  If it is not
   // there, treat the value as a full path on the server.
   $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $fasta_file;
   if (!file_exists($dfile)) {
      $dfile = $fasta_file;
   }
   if (!file_exists($dfile)) {
      form_set_error('fasta_file', t("Cannot find the file on the system. Check that the file exists or that the web server has permissions to read the file."));
   }

   // If any relationship field is filled in then all of them are needed.
   if (($rel_type or $parent_type) and !$re_subject) {
      form_set_error('re_subject', t("Please provide a regular expression for the parent"));
   }
   if (($rel_type or $re_subject) and !$parent_type) {
      form_set_error('parent_type', t("Please provide a SO term for the parent"));
   }
   if (($parent_type or $re_subject) and !$rel_type) {
      form_set_error('rel_type', t("Please select a relationship type"));
   }

   // The external database and the accession expression go together.
   if ($db_id and !$re_accession) {
      form_set_error('re_accession', t("Please provide a regular expression for the accession"));
   }
   if ($re_accession and !$db_id) {
      form_set_error('db_id', t("Please select a database"));
   }

   // Make sure the requested types exist.  The vocabulary tables are chado
   // tables, so the lookups must run with the chado database active, exactly
   // like every other chado query in this file.
   $cvtermsql = "SELECT CVT.cvterm_id
                 FROM {cvterm} CVT
                    INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
                    LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
                 WHERE cv.name = '%s' and (CVT.name = '%s' or CVTS.synonym = '%s')";
   $previous_db = tripal_db_set_active('chado');
   $type_term = db_fetch_object(db_query($cvtermsql, 'sequence', $type, $type));
   $parent_term = TRUE;
   if ($rel_type) {
      $parent_term = db_fetch_object(db_query($cvtermsql, 'sequence', $parent_type, $parent_type));
   }
   tripal_db_set_active($previous_db);

   if (!$type_term) {
      form_set_error('type', t("The Sequence Ontology (SO) term selected for the sequence type is not available in the database. Please check spelling or select another."));
   }
   if (!$parent_term) {
      form_set_error('parent_type', t("The Sequence Ontology (SO) term selected for the parent relationship is not available in the database. Please check spelling or select another."));
   }

   // Hand the resolved path over to the submit handler.
   $form_state['storage']['dfile'] = $dfile;
}
 
/**
 * Submit handler for the FASTA loader form.
 *
 * Queues a Tripal job that calls tripal_feature_load_fasta() with the
 * submitted settings.  The file path is the one stored in
 * $form_state['storage']['dfile'].
 *
 * @param $form
 *   The form array (not used).
 * @param $form_state
 *   The form state holding the submitted values.
 *
 * @ingroup fasta_loader
 */
function tripal_feature_fasta_load_form_submit($form, &$form_state) {
   global $user;

   $values = $form_state['values'];

   $dfile        = $form_state['storage']['dfile'];
   $organism_id  = $values['organism_id'];
   $type         = trim($values['type']);
   $update       = trim($values['update']);
   // The form may not contain a library selector, in which case no
   // 'library_id' value is submitted; pass NULL rather than reading an
   // undefined index.
   $library_id   = isset($values['library_id']) ? $values['library_id'] : NULL;
   $re_name      = trim($values['re_name']);
   $re_uname     = trim($values['re_uname']);
   $re_accession = trim($values['re_accession']);
   $db_id        = $values['db_id'];
   $rel_type     = $values['rel_type'];
   $re_subject   = trim($values['re_subject']);
   $parent_type  = trim($values['parent_type']);

   // The order of these arguments must match the parameter list of
   // tripal_feature_load_fasta().
   $args = array(
      $dfile, $organism_id, $type, $library_id, $re_name, $re_uname,
      $re_accession, $db_id, $rel_type, $re_subject, $parent_type, $update,
      $user->uid,
   );

   tripal_add_job("Import FASTA file: $dfile", 'tripal_feature',
      'tripal_feature_load_fasta', $args, $user->uid);
}
 
/**
 * Extracts one value from a FASTA definition line.
 *
 * @param $regex
 *   A regular expression body (without delimiters) whose first capture group
 *   holds the value.  It is wrapped in '/' delimiters, so a literal '/' inside
 *   the expression has to be escaped by the user.  When empty, the first word
 *   of the definition line (everything up to the first whitespace or '|') is
 *   returned instead.
 * @param $defline
 *   The definition line without the leading '>'.
 *
 * @return
 *   The trimmed value, or an empty string when nothing could be extracted.
 */
function _tripal_feature_fasta_loader_parse_defline($regex, $defline) {
   if (!$regex) {
      $words = preg_split('/[\s\|]/', $defline, 2);
      return trim($words[0]);
   }
   if (preg_match("/$regex/", $defline, $matches) and isset($matches[1])) {
      return trim($matches[1]);
   }
   return '';
}

/**
 * Loads the sequences of a FASTA file as features.
 *
 * The file is read line by line.  Each definition line (starting with '>')
 * begins a new record; all following lines up to the next definition line
 * are concatenated into the residues of that record.  Every complete record
 * is handed to tripal_feature_fasta_loader_insert_feature().
 *
 * @param $dfile
 *   Path of the FASTA file.
 * @param $organism_id
 *   The organism_id the features belong to.
 * @param $type
 *   The Sequence Ontology term of the features.
 * @param $library_id
 *   Passed through to the insert function.
 * @param $re_name
 *   Regular expression (first capture group) for the feature name; when
 *   empty the first word of the definition line is used.
 * @param $re_uname
 *   Regular expression (first capture group) for the unique name; when
 *   empty the first word of the definition line is used.
 * @param $re_accession
 *   Regular expression (first capture group) for the external accession;
 *   when empty no accession is extracted.
 * @param $db_id
 *   The db_id of the external database for the accession.
 * @param $rel_type
 *   The relationship term to use between each feature and its parent.
 * @param $re_subject
 *   Regular expression (first capture group) for the unique name of the
 *   parent feature; when empty no parent is extracted.
 * @param $parent_type
 *   The Sequence Ontology term of the parent feature.
 * @param $update
 *   When TRUE existing features are updated instead of skipped.
 * @param $uid
 *   The id of the user that submitted the job (not used by this function).
 * @param $job
 *   Optional job id; when given, progress is reported with
 *   tripal_job_set_progress().
 *
 * @return
 *   Always an empty string.
 *
 * @ingroup fasta_loader
 */
function tripal_feature_load_fasta($dfile, $organism_id, $type,
   $library_id, $re_name, $re_uname, $re_accession, $db_id, $rel_type,
   $re_subject, $parent_type, $update, $uid, $job = NULL)
{
   print "Opening FASTA file $dfile\n";

   // Stream the file instead of loading it completely: FASTA files can be
   // far larger than the PHP memory limit.
   $fh = is_readable($dfile) ? fopen($dfile, 'r') : FALSE;
   if (!$fh) {
      print "ERROR: cannot open the FASTA file for reading: '$dfile'\n";
      return '';
   }

   $filesize     = filesize($dfile);
   $bytes_read   = 0;
   $last_percent = 0;

   // The insert function takes a 'source' argument that this loader has no
   // value for.
   $source    = '';
   $name      = '';
   $uname     = '';
   $accession = '';
   $subject   = '';
   $residues  = '';

   while (($line = fgets($fh)) !== FALSE) {
      // Report progress each time another percent of the file has been read.
      $bytes_read += strlen($line);
      if ($job and $filesize > 0) {
         $percent = intval(($bytes_read / $filesize) * 100);
         if ($percent > $last_percent) {
            tripal_job_set_progress($job, $percent);
            $last_percent = $percent;
         }
      }

      // Anything that is not a definition line is sequence data.
      if (substr($line, 0, 1) != '>') {
         $residues .= trim($line);
         continue;
      }

      // A definition line ends the previous record, so store that one first.
      if ($name !== '' and $uname !== '') {
         tripal_feature_fasta_loader_insert_feature($name, $uname, $db_id,
            $accession, $subject, $rel_type, $parent_type, $library_id, $organism_id, $type,
            $source, $residues, $update, $re_name);
      }
      // Always start the new record with empty residues, even when the
      // previous record was skipped, so sequence data never leaks from one
      // record into the next.
      $residues = '';

      // The line ending is kept on purpose: user supplied expressions may
      // rely on the trailing whitespace.
      $defline = (string) substr($line, 1);

      $name = _tripal_feature_fasta_loader_parse_defline($re_name, $defline);
      if ($re_name and $name === '') {
         print "Regular expression for the feature name finds nothing\n";
      }
      $uname = _tripal_feature_fasta_loader_parse_defline($re_uname, $defline);

      // The accession and the parent are optional: only look for them when
      // an expression was provided.
      $accession = $re_accession ? _tripal_feature_fasta_loader_parse_defline($re_accession, $defline) : '';
      $subject   = $re_subject ? _tripal_feature_fasta_loader_parse_defline($re_subject, $defline) : '';

      if ($name === '' or $uname === '') {
         print "WARNING: cannot determine the name and unique name, skipping sequence: '" . trim($defline) . "'\n";
      }
   }
   fclose($fh);

   // Store the last record of the file, if there is one.
   if ($name !== '' and $uname !== '') {
      tripal_feature_fasta_loader_insert_feature($name, $uname, $db_id,
         $accession, $subject, $rel_type, $parent_type, $library_id, $organism_id, $type,
         $source, $residues, $update, $re_name);
   }
   return '';
}
 
/**
 * Looks up a cvterm by name or synonym.  Expects chado to be the active db.
 *
 * @param $cv_name
 *   The name of the controlled vocabulary, e.g. 'sequence'.
 * @param $term
 *   The term name or one of its synonyms.
 *
 * @return
 *   An object with the cvterm_id, or FALSE when the term is not found.
 */
function _tripal_feature_fasta_loader_find_cvterm($cv_name, $term) {
   $cvtermsql = "SELECT CVT.cvterm_id
                 FROM {cvterm} CVT
                    INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
                    LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
                 WHERE cv.name = '%s' and (CVT.name = '%s' or CVTS.synonym = '%s')";
   return db_fetch_object(db_query($cvtermsql, $cv_name, $term, $term));
}

/**
 * Looks up a feature by organism, unique name and type.  Expects chado to be
 * the active db.
 *
 * @return
 *   The feature record, or FALSE when there is none.
 */
function _tripal_feature_fasta_loader_find_feature($organism_id, $uname, $type_id) {
   $feature_sql = "SELECT * FROM {feature}
                   WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
   return db_fetch_object(db_query($feature_sql, $organism_id, $uname, $type_id));
}

/**
 * Links a feature to an accession of an external database, creating the
 * dbxref record first when needed.  Expects chado to be the active db.
 */
function _tripal_feature_fasta_loader_add_dbxref($feature, $db_id, $accession, $name, $uname) {
   // Without an accession there is nothing to reference; do not create an
   // empty dbxref record.
   if ($accession === '' or $accession === NULL) {
      print "WARNING: no accession found, skipping database cross reference for '$name ($uname)'\n";
      return;
   }

   // check to see if this accession reference exists, if not add it
   $dbxrefsql = "SELECT * FROM {dbxref} WHERE db_id = %d and accession = '%s'";
   $dbxref = db_fetch_object(db_query($dbxrefsql, $db_id, $accession));
   if (!$dbxref) {
      $sql = "INSERT INTO {dbxref} (db_id,accession) VALUES (%d,'%s')";
      if (!db_query($sql, $db_id, $accession)) {
         print "WARNING: could not add external database acession: '$name accession: $accession'\n";
      }
      $dbxref = db_fetch_object(db_query($dbxrefsql, $db_id, $accession));
   }
   if (!$dbxref) {
      // The failure was reported above; there is no record to link to.
      return;
   }

   // check to see if the feature dbxref record exists if not, then add it
   $fdbxrefsql = "SELECT * FROM {feature_dbxref} WHERE feature_id = %d and dbxref_id = %d";
   $fdbxref = db_fetch_object(db_query($fdbxrefsql, $feature->feature_id, $dbxref->dbxref_id));
   if ($fdbxref) {
      return;
   }
   $sql = "INSERT INTO {feature_dbxref} (feature_id,dbxref_id) VALUES (%d,%d)";
   if (!db_query($sql, $feature->feature_id, $dbxref->dbxref_id)) {
      print "WARNING: could not associate database cross reference with feature: '$name accession: $accession'\n";
   }
   else {
      print "Added database crossreference $name ($uname) -> $accession\n";
   }
}

/**
 * Relates a feature (the subject) to an existing parent feature (the
 * object).  Expects chado to be the active db.
 */
function _tripal_feature_fasta_loader_add_relationship($feature, $organism_id, $uname, $type,
   $parent, $rel_type, $parent_type)
{
   $parentcvterm = _tripal_feature_fasta_loader_find_cvterm('sequence', $parent_type);
   if (!$parentcvterm) {
      print "WARNING: cannot establish relationship '$uname' ($type) $rel_type '$parent' ($parent_type): Cannot find the parent type\n";
      return;
   }
   $relcvterm = _tripal_feature_fasta_loader_find_cvterm('relationship', $rel_type);
   if (!$relcvterm) {
      print "WARNING: cannot establish relationship '$uname' ($type) $rel_type '$parent' ($parent_type): Cannot find the relationship type\n";
      return;
   }
   $parent_feature = _tripal_feature_fasta_loader_find_feature($organism_id, $parent, $parentcvterm->cvterm_id);
   if (!$parent_feature) {
      print "WARNING: cannot establish relationship '$uname' ($type) $rel_type '$parent' ($parent_type): Cannot find the parent\n";
      return;
   }

   // check to see if the relationship already exists
   $sql = "SELECT * FROM {feature_relationship} WHERE subject_id = %d and object_id = %d and type_id = %d";
   $rel = db_fetch_object(db_query($sql, $feature->feature_id, $parent_feature->feature_id, $relcvterm->cvterm_id));
   if ($rel) {
      print "WARNING: relationship already exists, skipping '$uname' ($type) $rel_type '$parent' ($parent_type)\n";
      return;
   }

   $sql = "INSERT INTO {feature_relationship} (subject_id,object_id,type_id)
           VALUES (%d,%d,%d)";
   if (!db_query($sql, $feature->feature_id, $parent_feature->feature_id, $relcvterm->cvterm_id)) {
      print "WARNING: failed to insert feature relationship '$uname' ($type) $rel_type '$parent' ($parent_type)\n";
   }
   else {
      print "Inserted relationship relationship: '$uname' ($type) $rel_type '$parent' ($parent_type)\n";
   }
}

/**
 * Inserts or updates one feature and its optional cross reference and
 * relationship.  Expects chado to be the active db.
 *
 * @return
 *   0 when the feature could not be stored, NULL otherwise.
 */
function _tripal_feature_fasta_loader_save_feature($name, $uname, $db_id, $accession,
   $parent, $rel_type, $parent_type, $organism_id, $type, $residues, $update, $re_name)
{
   // first get the type for this sequence
   $cvterm = _tripal_feature_fasta_loader_find_cvterm('sequence', $type);
   if (!$cvterm) {
      print "ERROR: cannot find the term type: '$type'\n";
      return 0;
   }

   $seqlen = strlen($residues);
   $md5    = md5($residues);

   // check to see if this feature already exists
   $feature = _tripal_feature_fasta_loader_find_feature($organism_id, $uname, $cvterm->cvterm_id);
   if (!$feature) {
      $sql = "INSERT INTO {feature} (organism_id, name, uniquename, residues, seqlen, md5checksum,type_id,is_analysis,is_obsolete)
              VALUES(%d,'%s','%s','%s',%d, '%s', %d, false, false)";
      if (!db_query($sql, $organism_id, $name, $uname, $residues, $seqlen, $md5, $cvterm->cvterm_id)) {
         print "ERROR: failed to insert feature '$name ($uname)'\n";
         return 0;
      }
      print "Inserted feature $name ($uname)\n";
   }
   elseif ($update) {
      // We do not want to wipe out the name if the user did not intend for
      // this to happen.  The uniquename must match the sequence but the name
      // may not, so the name is only updated when the user specified an
      // 're_name' regular expression.
      if ($re_name) {
         $sql = "UPDATE {feature}
                 SET name = '%s', residues = '%s', seqlen = %d, md5checksum = '%s'
                 WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
         $result = db_query($sql, $name, $residues, $seqlen, $md5, $organism_id, $uname, $cvterm->cvterm_id);
      }
      else {
         $sql = "UPDATE {feature}
                 SET residues = '%s', seqlen = %d, md5checksum = '%s'
                 WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
         $result = db_query($sql, $residues, $seqlen, $md5, $organism_id, $uname, $cvterm->cvterm_id);
      }
      if (!$result) {
         print "ERROR: failed to update feature '$name ($uname)'\n";
         return 0;
      }
      print "Updated feature $name ($uname)\n";
   }
   else {
      print "WARNING: feature already exists, skipping: '$name ($uname)'\n";
   }

   // now get the feature (again), so that a newly inserted record is
   // available with its feature_id
   $feature = _tripal_feature_fasta_loader_find_feature($organism_id, $uname, $cvterm->cvterm_id);
   if (!$feature) {
      print "Something bad has happened: $organism_id, $uname, $cvterm->cvterm_id\n";
      return 0;
   }

   // now add the database cross reference
   if ($db_id) {
      _tripal_feature_fasta_loader_add_dbxref($feature, $db_id, $accession, $name, $uname);
   }

   // now add in the relationship if one exists
   if ($rel_type) {
      _tripal_feature_fasta_loader_add_relationship($feature, $organism_id, $uname, $type,
         $parent, $rel_type, $parent_type);
   }
   return NULL;
}

/**
 * Stores a single FASTA record as a feature in chado.
 *
 * A new feature is inserted when no feature with the same organism, unique
 * name and type exists.  An existing feature is updated when $update is set
 * and left untouched (with a warning) otherwise.  Afterwards the optional
 * database cross reference and the optional relationship to a parent feature
 * are added.  Progress and problems are printed.
 *
 * The chado database is made active for the duration of the call and the
 * previously active database is restored before returning, also when the
 * feature could not be stored.
 *
 * @param $name
 *   The feature name.
 * @param $uname
 *   The unique name of the feature.
 * @param $db_id
 *   The db_id of the external database, or empty for no cross reference.
 * @param $accession
 *   The accession of the feature in the external database.
 * @param $parent
 *   The unique name of the parent feature.
 * @param $rel_type
 *   The relationship term (looked up in the 'relationship' vocabulary), or
 *   empty for no relationship.
 * @param $parent_type
 *   The Sequence Ontology term of the parent feature.
 * @param $library_id
 *   Not used by this function.
 * @param $organism_id
 *   The organism_id of the feature (and of the parent).
 * @param $type
 *   The Sequence Ontology term of the feature.
 * @param $source
 *   Not used by this function.
 * @param $residues
 *   The sequence.
 * @param $update
 *   When TRUE an existing feature is updated.
 * @param $re_name
 *   The regular expression the name was extracted with.  The name of an
 *   existing feature is only updated when this is not empty.
 *
 * @return
 *   0 when the feature could not be stored, NULL otherwise.
 *
 * @ingroup fasta_loader
 */
function tripal_feature_fasta_loader_insert_feature($name, $uname, $db_id, $accession,
   $parent, $rel_type, $parent_type, $library_id, $organism_id, $type,
   $source, $residues, $update, $re_name = '')
{
   $previous_db = tripal_db_set_active('chado');
   $status = _tripal_feature_fasta_loader_save_feature($name, $uname, $db_id, $accession,
      $parent, $rel_type, $parent_type, $organism_id, $type, $residues, $update, $re_name);
   // Restore the previous database on every path; leaving chado active would
   // break later Drupal queries.
   tripal_db_set_active($previous_db);
   return $status;
}
 
 
  |