|
@@ -0,0 +1,403 @@
|
|
|
+<?php
|
|
|
+
|
|
|
/**
 * Builds the form used to queue the import of a GAF file.
 *
 * The form collects the path of the GAF file, the Sequence Ontology type and
 * the organism of the features the file refers to, the import mode (add or
 * delete GO terms), the analysis used to derive the annotations and optional
 * regular expressions for extracting the feature name or unique name from
 * column 2 of the file.
 *
 * The organism and analysis select lists are filled from the chado database;
 * the active database is switched to chado only for the duration of each
 * query and then restored.
 *
 * @return
 *   The form definition array.
 *
 * @ingroup gff3_loader
 */
function tripal_analysis_go_gaf_load_form (){

  $form = array();

  $form['notice']= array(
    '#type' => 'item',
    '#value' => t('Note: currently, the GAF loader only uses column 2 (Object ID) and 5 (GO ID)
 from the GAF file, and simply imports GO terms for the features.
 Further support for this file format will be provided later.'),
  );

  $form['gaf_file']= array(
    '#type' => 'textfield',
    '#title' => t('GAF File'),
    '#description' => t('Please enter the full system path for the GAF file, or a path within the Drupal
 installation (e.g. /sites/default/files/xyz.txt). The path must be accessible to the
 server on which this Drupal instance is running.'),
    '#required' => TRUE,
  );

  // get the list of organisms
  $sql = "SELECT * FROM {organism} ORDER BY genus, species";
  $previous_db = tripal_db_set_active('chado');  // use chado database
  $org_rset = db_query($sql);
  tripal_db_set_active($previous_db);  // now use drupal database
  $organisms = array();
  // the empty first entry forces the user to make an explicit choice
  $organisms[''] = '';
  while($organism = db_fetch_object($org_rset)){
    $organisms[$organism->organism_id] = "$organism->genus $organism->species ($organism->common_name)";
  }

  $form['type']= array(
    '#type' => 'textfield',
    '#title' => t('Sequence Type'),
    '#required' => TRUE,
    '#description' => t('Please enter the Sequence Ontology term that describes the sequences in the GAF file.'),
  );

  $form['organism_id'] = array (
    '#title' => t('Organism'),
    // The element type is a Form API keyword, not display text, so it must
    // not be passed through t(): a translation would change the type.
    '#type' => 'select',
    '#description' => t("Choose the organism to which these sequences are associated "),
    '#required' => TRUE,
    '#options' => $organisms,
  );

  // NOTE(review): the fieldsets below set #collapsed without #collapsible;
  // confirm against the Form API whether #collapsed has any effect alone.
  $form['import_options'] = array(
    '#type' => 'fieldset',
    '#title' => t('Import Options'),
    '#collapsed' => TRUE
  );
  $form['import_options']['add_only']= array(
    '#type' => 'checkbox',
    '#title' => t('Add GO terms'),
    '#required' => FALSE,
    '#description' => t('GO terms in the GAF file will be added to each feature.'),
  );
  // The "replace" option is disabled for now.
//  $form['import_options']['replace']= array(
//    '#type' => 'checkbox',
//    '#title' => t('Replace GO terms'),
//    '#required' => FALSE,
//    '#description' => t('All GO terms for features in the GAF file will be replaced with terms in the GAF file.'),
//  );
  $form['import_options']['remove']= array(
    '#type' => 'checkbox',
    '#title' => t('Delete GO terms'),
    '#required' => FALSE,
    '#description' => t('GO terms for features in the GAF file will be removed. Other terms will remain.'),
  );

  $form['analysis'] = array(
    '#type' => 'fieldset',
    '#title' => t('Analysis Used to Derive GO terms'),
    '#collapsed' => TRUE
  );

  // get the list of analyses
  $sql = "SELECT * FROM {analysis} ORDER BY name";
  $previous_db = tripal_db_set_active('chado');  // use chado database
  $analysis_rset = db_query($sql);
  tripal_db_set_active($previous_db);  // now use drupal database
  $analyses = array();
  $analyses[''] = '';
  while($analysis = db_fetch_object($analysis_rset)){
    $analyses[$analysis->analysis_id] = "$analysis->name ($analysis->program $analysis->programversion, $analysis->sourcename)";
  }
  $form['analysis']['analysis_id'] = array (
    '#title' => t('Analysis'),
    '#type' => 'select',
    '#description' => t("Choose the analysis that defines how the GO annotations in the GAF file were created. "),
    '#required' => TRUE,
    '#options' => $analyses,
  );

  // Advanced Options
  $form['advanced'] = array(
    '#type' => 'fieldset',
    '#title' => t('Advanced Options'),
    '#collapsed' => TRUE
  );
  $form['advanced']['re_help']= array(
    '#type' => 'item',
    '#value' => t('A regular expression is an advanced method for extracting information from a string of text.
 By default, this loader will use the first word in the second column of the GAF file
 as the uniquename for the sequences. If this is not desired, you may use the following regular
 expressions to define the location of the name or unique name within the text of column 2.'),
  );

  $form['advanced']['re_name']= array(
    '#type' => 'textfield',
    '#title' => t('Regular expression for the name'),
    '#required' => FALSE,
    '#description' => t('Enter the regular expression that will extract the feature name. For example, for text with a name and uniquename separated by a bar \'|\' (>seqname|uniquename), the regular expression would be, "^(.*?)\|.*$". the name must be unique for this organism and sequence type.' ),
  );

  $form['advanced']['re_uname']= array(
    '#type' => 'textfield',
    '#title' => t('Regular expression for the unique name'),
    '#required' => FALSE,
    '#description' => t('Enter the regular expression that will extract the unique feature name. For example, for text with a name and uniquename separated by a bar \'|\' (>seqname|uniquename), the regular expression would be, "^.*?\|(.*)$". the name must be unique for this organism and sequence type.' ),
  );

  $form['button'] = array(
    '#type' => 'submit',
    '#value' => t('Import GAF file'),
  );
  return $form;
}
|
|
|
/**
 * Validates the GAF import form.
 *
 * Registers a form error when:
 *  - the GAF file cannot be found, either relative to the Drupal installation
 *    or at the path exactly as entered;
 *  - more than one import option checkbox is selected;
 *  - regular expressions are given for both the name and the unique name;
 *  - the sequence type is not a term (or synonym of a term) of the 'sequence'
 *    controlled vocabulary.
 *
 * @param $form
 *   The form definition array (not used).
 * @param $form_state
 *   The form state; the submitted values are read from
 *   $form_state['values'].
 *
 * @ingroup gff3_loader
 */
function tripal_analysis_go_gaf_load_form_validate ($form, &$form_state){

  $values   = $form_state['values'];
  $gaf_file = $values['gaf_file'];
  $add_only = $values['add_only'];
  $remove   = $values['remove'];
  // The "replace" checkbox is commented out in the form builder, so its
  // value may be absent from the submitted values.
  $replace  = isset($values['replace']) ? $values['replace'] : 0;
  $type     = trim($values['type']);
  $re_name  = trim($values['re_name']);
  $re_uname = trim($values['re_uname']);

  // check to see if the file is located local to Drupal
  $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $gaf_file;
  if(!file_exists($dfile)){
    // if not local to Drupal, the file must be someplace else, just use
    // the full path provided
    $dfile = $gaf_file;
  }
  if(!file_exists($dfile)){
    // the error must be set on 'gaf_file', the name of the form element
    form_set_error('gaf_file',t("Cannot find the file on the system. Check that the file exists or that the web server has permissions to read the file."));
  }

  // the import options are mutually exclusive
  $num_selected = ($add_only ? 1 : 0) + ($replace ? 1 : 0) + ($remove ? 1 : 0);
  if($num_selected > 1){
    form_set_error('add_only',t("Please select only one checkbox from the import options section"));
  }

  if($re_name and $re_uname){
    form_set_error('re_name',t("Please provide a regular expression for the name or the unique name only, not both."));
  }

  // check to make sure the type exists.  The cvterm tables are queried with
  // the chado database active, as is done for the organism and analysis
  // queries of the form builder.
  $cvtermsql = "SELECT CVT.cvterm_id
                FROM {cvterm} CVT
                   INNER JOIN {cv} CV on CVT.cv_id = CV.cv_id
                   LEFT JOIN {cvtermsynonym} CVTS on CVTS.cvterm_id = CVT.cvterm_id
                WHERE cv.name = '%s' and (CVT.name = '%s' or CVTS.synonym = '%s')";
  $previous_db = tripal_db_set_active('chado');  // use chado database
  $cvterm = db_fetch_object(db_query($cvtermsql,'sequence',$type,$type));
  tripal_db_set_active($previous_db);  // now use drupal database
  if(!$cvterm){
    form_set_error('type',t("The Sequence Ontology (SO) term selected for the sequence type is not available in the database. Please check spelling or select another."));
  }
  // The former parent relationship check was removed: it tested $rel_type and
  // $parent_type, which are never defined here and have no form elements, so
  // it could never run.
}
|
|
|
+
|
|
|
/**
 * Submit handler for the GAF import form: queues the import as a Tripal job.
 *
 * The job calls tripal_analysis_go_load_gaf() with the submitted values.  A
 * path that exists relative to the Drupal installation is expanded to its
 * full path before it is handed to the job, because the loader opens the
 * path it receives verbatim.
 *
 * @param $form
 *   The form definition array (not used).
 * @param $form_state
 *   The form state; the submitted values are read from
 *   $form_state['values'].
 *
 * @return
 *   An empty string.
 *
 * @ingroup gff3_loader
 */
function tripal_analysis_go_gaf_load_form_submit ($form, &$form_state){
  global $user;

  $values      = $form_state['values'];
  $gaf_file    = $values['gaf_file'];
  $organism_id = $values['organism_id'];
  $add_only    = $values['add_only'];
  $remove      = $values['remove'];
  // The "replace" checkbox is commented out in the form builder, so its
  // value may be absent from the submitted values.
  $replace     = isset($values['replace']) ? $values['replace'] : 0;
  $analysis_id = $values['analysis_id'];
  $type        = trim($values['type']);
  $re_name     = trim($values['re_name']);
  $re_uname    = trim($values['re_uname']);

  // Use the same lookup as the validate handler: prefer the file located
  // within the Drupal installation, otherwise keep the path as entered.
  $doc_root = isset($_SERVER['DOCUMENT_ROOT']) ? $_SERVER['DOCUMENT_ROOT'] : '';
  $job_file = $gaf_file;
  $dfile = $doc_root . base_path() . $gaf_file;
  if(file_exists($dfile)){
    $job_file = $dfile;
  }

  $args = array($job_file,$organism_id,$analysis_id,$add_only,$replace,$remove,$re_name,$re_uname,$type);

  // Describe the action in the job name.  A separate variable is used so the
  // sequence type is not mistaken for the action; when no option is ticked
  // the loader adds terms, so that is the default description.
  $action = 'add GO terms';
  if($replace){
    $action = 'replace and add GO terms';
  }
  if($remove){
    $action = 'remove GO terms';
  }
  tripal_add_job("Import GAF 2.0 file $gaf_file and $action",'tripal_analysis_go',
    'tripal_analysis_go_load_gaf',$args,$user->uid);

  return '';
}
|
|
|
+
|
|
|
/**
 * Loads the GO annotations of a GAF file into chado.
 *
 * Only column 2 (the object ID) and column 5 (the GO ID) of each line are
 * used.  For every non-comment line the feature is looked up by organism,
 * sequence type and name and/or unique name, and the GO term is then added
 * to, or removed from, that feature by tripal_analysis_go_load_gaff_go_term().
 *
 * Because required parameters follow the optional ones in the signature,
 * callers have to pass at least the first nine arguments.
 *
 * @param $gaf_file
 *   The path of the GAF file; it is opened exactly as given.
 * @param $organism_id
 *   The organism_id of the features referenced by the file.
 * @param $analysis_id
 *   The analysis used to derive the annotations (not used by this function).
 * @param $add_only
 *   Import option flag (not used by this function).
 * @param $replace
 *   Import option flag (not used by this function).
 * @param $remove
 *   When set, the GO terms are removed from the features instead of added.
 * @param $re_name
 *   Optional regular expression (without delimiters; it is wrapped in '/')
 *   whose first capture group extracts the feature name from column 2.
 * @param $re_uname
 *   Optional regular expression (without delimiters) whose first capture
 *   group extracts the feature unique name from column 2.
 * @param $type
 *   The name of the 'sequence' vocabulary term describing the features.
 * @param $job
 *   Optional job identifier; when given the job progress is updated.
 *
 * @return
 *   1 when the whole file was processed, or an empty string when the file
 *   cannot be read or a line has fewer than 15 columns.
 *
 * @ingroup gff3_loader
 */
function tripal_analysis_go_load_gaf($gaf_file, $organism_id,$analysis_id,$add_only =0,
  $replace = 0, $remove = 0, $re_name, $re_uname, $type, $job = NULL)
{
  print "Opening GAF file $gaf_file\n";

  // Newlines are stripped so the last column carries no line ending.  Blank
  // lines are skipped inside the loop rather than with FILE_SKIP_EMPTY_LINES
  // so that the reported line numbers match the file.
  $lines = file($gaf_file,FILE_IGNORE_NEW_LINES);
  if($lines === FALSE){
    print "ERROR: unable to read the GAF file $gaf_file\n";
    return '';
  }

  $i = 0;
  $num_lines = sizeof($lines);
  $interval = intval($num_lines * 0.01);
  if($interval == 0){
    $interval = 1;
  }

  // the organism and sequence type are the same for every feature lookup
  $base_values = array(
    'type_id' => array(
      'cv_id' => array(
        'name' => 'sequence'
      ),
      'name' => $type,
    ),
    'organism_id' => $organism_id,
  );

  foreach ($lines as $line) {
    $i++;  // update the line count

    // skip blank lines and comments
    if(trim($line) === '' or preg_match('/^\!/',$line)){
      continue;
    }

    // update the job status every 1% features
    if($job and $i % $interval == 0){
      tripal_job_set_progress($job,intval(($i/$num_lines)*100));
    }

    // Split the line into its columns.  The columns are, in order: db,
    // object ID, symbol, qualifier, GO ID, db reference, evidence code,
    // with/from, aspect, object name, object synonym, object type, taxon,
    // date, assigned by and, optionally, annotation extension and gene
    // product form ID.  Only the object ID and GO ID are used.
    $cols = explode("\t",$line);
    if(sizeof($cols) < 15){
      print "ERROR: improper number of columns on line $i\n";
      print_r($cols);
      return '';
    }
    $object = $cols[1];
    $go_id  = trim($cols[4]);

    // get the name or uniquename for the feature
    $uname = $object;
    $name = '';
    $matches = array();
    if($re_name){
      if(!preg_match("/$re_name/",$object,$matches)){
        print "Regular expression for the feature name finds nothing\n";
      } else {
        // the expression may have been given without a capture group
        $name = isset($matches[1]) ? trim($matches[1]) : '';
      }
    } else {
      // By default the name is the text before the first whitespace or bar.
      // When the object ID has neither, nothing matches and only the unique
      // name is used for the lookup.
      if(preg_match("/^\s*(.*?)[\s\|].*$/",$object,$matches)){
        $name = trim($matches[1]);
      }
    }
    if($re_uname){
      if(!preg_match("/$re_uname/",$object,$matches)){
        print "Regular expression for the feature unique name finds nothing\n";
      } else {
        $uname = isset($matches[1]) ? trim($matches[1]) : $uname;
      }
    }

    // get the feature
    $values = $base_values;
    if($name !== ''){
      $values['name'] = $name;
    }
    if($uname !== ''){
      $values['uniquename'] = $uname;
    }
    $feature = tripal_core_chado_select('feature',array('*'),$values);
    if(empty($feature)){
      // without a feature there is nothing to attach the GO term to
      print "ERROR: cannot find the feature for '$object' on line $i, skipping $go_id\n";
      continue;
    }

    // add the GO term to the feature (or remove it when $remove is set)
    tripal_analysis_go_load_gaff_go_term($feature[0],$go_id,$remove);
  }
  return 1;
}
|
|
|
/**
 * Adds a GO term to a feature, or removes it from the feature.
 *
 * The reference is split on ':' into a database name and an accession.  The
 * database (first as "DB:<name>", then as "<name>"), the accession and the
 * cvterm (directly, then through an alternate id in cvterm_dbxref) must all
 * exist already; none of them is created here.
 *
 * @param $feature
 *   The feature record object; feature_id and uniquename are read from it.
 * @param $dbxref
 *   The term reference in the form "<database>:<accession>", e.g. the GO ID
 *   taken from the GAF file.
 * @param $remove
 *   When set, the association between the feature and the term is deleted if
 *   it exists, and nothing is added.  Otherwise the association is added if
 *   it does not exist yet.
 *
 * @return
 *   1 when the term was added, deleted or needed no change; 0 on any failure.
 */
function tripal_analysis_go_load_gaff_go_term($feature,$dbxref,$remove){

  // a feature is required, e.g. the lookup of the caller may have found none
  if(!is_object($feature) or empty($feature->feature_id)){
    print "ERROR: no feature available for reference: $dbxref\n";
    return 0;
  }

  // get the database name and accession from the reference
  $ref = explode(":",$dbxref);
  if(sizeof($ref) < 2){
    print "ERROR: improperly formatted reference: $dbxref\n";
    return 0;
  }
  $dbname = $ref[0];
  $accession = $ref[1];

  // first look for the database name
  $db = tripal_core_chado_select('db',array('db_id'),array('name' => "DB:$dbname"));
  if(empty($db)){
    $db = tripal_core_chado_select('db',array('db_id'),array('name' => "$dbname"));
  }
  if(empty($db)){
    print "ERROR: Database, $dbname is missing for reference: $dbname:$accession\n";
    return 0;
  }
  $db = $db[0];

  // now check to see if the accession exists
  $xref = tripal_core_chado_select('dbxref',array('dbxref_id'),array(
    'accession' => $accession,'db_id' => $db->db_id));
  if(empty($xref)){
    print "ERROR: Accession, $accession is missing for reference: $dbname:$accession\n";
    return 0;
  }
  $xref = $xref[0];

  // now check to see if the cvterm exists
  $cvterm = tripal_core_chado_select('cvterm',array('cvterm_id'),array(
    'dbxref_id' => $xref->dbxref_id));
  // if it doesn't exist in the cvterm table, look for an alternate id
  if(empty($cvterm)){
    $cvterm = tripal_core_chado_select('cvterm_dbxref',array('cvterm_id'),array(
      'dbxref_id' => $xref->dbxref_id));
  }
  if(empty($cvterm)){
    print "ERROR: CVTerm is missing for reference: $dbname:$accession\n";
    return 0;
  }
  $cvterm = $cvterm[0];

  // check to see if this feature cvterm already exists
  $association = array(
    'cvterm_id' => $cvterm->cvterm_id,
    'feature_id' => $feature->feature_id,
  );
  $fcvt = tripal_core_chado_select('feature_cvterm',array('feature_cvterm_id'),$association);
  $exists = !empty($fcvt);

  // When removing, an association that does not exist must not be created.
  if($remove){
    if(!$exists){
      print "   Ontology term $dbname:$accession is not associated to feature $feature->uniquename, nothing to delete\n";
      return 1;
    }
    $status = tripal_core_chado_delete('feature_cvterm',$association);
    if(!$status){
      print "ERROR: Failed to delete ontology term $dbname:$accession from feature $feature->uniquename\n";
      return 0;
    }
    print "   Deleted ontology term $dbname:$accession from feature $feature->uniquename\n";
    return 1;
  }

  if($exists){
    print "   Ontology term already associated to feature $feature->uniquename, skipping $dbname:$accession\n";
    return 1;
  }

  // now associate this feature with the cvterm
  $values = array(
    'feature_id' => $feature->feature_id,
    'cvterm_id' => $cvterm->cvterm_id,
    'pub_id' => array(
      'uniquename' => 'null',
    ),
  );
  $ret = tripal_core_chado_insert('feature_cvterm',$values);
  if(!$ret){
    print "ERROR: failed to insert ontology term: $dbname:$accession\n";
    return 0;
  }
  print "   Added ontology term $dbname:$accession to feature $feature->uniquename\n";

  return 1;
}
|