123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504 |
- <?php
- /*************************************************************************
- *
- */
- function tripal_core_gff3_load_form (){
- $form['gff_file']= array(
- '#type' => 'textfield',
- '#title' => t('GFF3 File'),
- '#description' => t('Please enter the full system path for the GFF file, or a path within the Drupal
- installation (e.g. /sites/default/files/xyz.obo). The path must be accessible to the
- server on which this Drupal instance is running.'),
- '#required' => TRUE,
- '#weight' => 1
- );
- $form['add_only']= array(
- '#type' => 'checkbox',
- '#title' => t('Import only new features'),
- '#required' => FALSE,
- '#default_value' => 'checked',
- '#description' => t('The job will skip features in the GFF file that already
- exist in the database and import only new features.'),
- '#weight' => 2
- );
- $form['update']= array(
- '#type' => 'checkbox',
- '#title' => t('Import all and update'),
- '#required' => FALSE,
- '#description' => t('Existing features will be updated and new features will be added. Attributes
- for a feature that are not present in the GFF but which are present in the
- database will not be altered.'),
- '#weight' => 3
- );
- $form['refresh']= array(
- '#type' => 'checkbox',
- '#title' => t('Import all and replace'),
- '#required' => FALSE,
- '#description' => t('Existing features will be updated and feature properties not
- present in the GFF file will be removed.'),
- '#weight' => 4
- );
- $form['remove']= array(
- '#type' => 'checkbox',
- '#title' => t('Delete features'),
- '#required' => FALSE,
- '#description' => t('Features present in the GFF file that exist in the database
- will be removed rather than imported'),
- '#weight' => 5
- );
- $form['button'] = array(
- '#type' => 'submit',
- '#value' => t('Import GFF3 file'),
- '#weight' => 10,
- );
- return $form;
- }
- /*************************************************************************
- *
- */
- function tripal_core_gff3_load_form_submit ($form, &$form_state){
- global $user;
- $gff_file = $form_state['values']['gff_file'];
- $add_only = $form_state['values']['add_only'];
- $update = $form_state['values']['update'];
- $refresh = $form_state['values']['refresh'];
- $remove = $form_state['values']['remove'];
- $args = array($gff_file,$add_only,$update,$refresh,$remove);
- $type = '';
- if($add_only){
- $type = 'import only new features';
- }
- if($update){
- $type = 'import all and update';
- }
- if($refresh){
- $type = 'import all and replace';
- }
- if($remove){
- $type = 'delete features';
- }
- tripal_add_job("Import GFF3 file: $type",'tripal_core',
- 'tripal_core_load_gff3',$args,$user->uid);
- return '';
- }
- /*************************************************************************
- *
- */
- function tripal_core_load_gff3($gff_file, $add_only =0, $update = 0, $refresh = 0, $remove = 0, $job = NULL){
-
- // check to see if the file is located local to Drupal
- $dfile = $_SERVER['DOCUMENT_ROOT'] . base_path() . $gff_file;
- if(!file_exists($dfile)){
- // if not local to Drupal, the file must be someplace else, just use
- // the full path provided
- $dfile = $gff_file;
- }
- if(!file_exists($dfile)){
- print "ERROR: cannot find the file: $dfile\n";
- return 0;
- }
- $previous_db = tripal_db_set_active('chado');
- print "Opening $gff_file\n";
-
- $lines = file($dfile,FILE_SKIP_EMPTY_LINES);
- $i = 0;
- // get the controlled vocaubulary that we'll be using. The
- // default is the 'sequence' ontology
- $vocab = 'sequence';
- $sql = "SELECT * FROM cv WHERE name = '%s'";
- $cv = db_fetch_object(db_query($sql,$vocab));
- if(!$cv){
- print "ERROR: cannot find the '$vocab' ontology\n";
- return '';
- }
- // get the organism for which this GFF3 file belongs
- $sql = "SELECT * FROM organism WHERE common_name = 'fruitfly'";
- $organism = db_fetch_object(db_query($sql));
- foreach ($lines as $line_num => $line) {
- $i++; // update the line count
- if(preg_match('/^#/',$line)){
- continue; // skip comments
- }
-
- // TODO: remove URL encoding
-
- $cols = explode("\t",$line);
- if(sizeof($cols) != 9){
- print "ERROR: improper number of columns on line $i\n";
- return '';
- }
- // get the column values
- $landmark = $cols[0];
- $source = $cols[1];
- $type = $cols[2];
- $start = $cols[3];
- $end = $cols[4];
- $score = $cols[5];
- $strand = $cols[6];
- $phase = $cols[7];
- $attrs = explode(";",$cols[8]); // split by a semi-colon
- // ready the start and stop for chado
- $fmin = $start;
- $fmax = $end;
- if($end < $start){
- $fmin = $end;
- $fmax = $start;
- }
- // format the strand for chado
- if(strcmp($strand,'.')==0){
- $strand = 0;
- }
- elseif(strcmp($strand,'+')==0){
- $strand = 1;
- }
- elseif(strcmp($strand,'-')==0){
- $strand = -1;
- }
- if(strcmp($phase,'.')==0){
- $phase = '';
- }
- // get the type record
- $sql = "SELECT * from cvterm CVT WHERE name = '%s' and cv_id = %d";
- $cvterm = db_fetch_object(db_query($sql,$type,$cv->cv_id));
- if(!$cvterm){
- print "ERROR: cannot find ontology term '$type' on line $i.\n";
- return '';
- }
-
- // break apart each of the attributes
- $tags = array();
- $attr_name = '';
- $attr_uniquename = '';
- $attr_residue_info = '';
- $attr_locgroup = 0;
- $attr_fmin_partial = 'f';
- $attr_fmax_partial = 'f';
- $attr_is_obsolete = 'f';
- $attr_is_analysis = 'f';
- $residues = '';
- foreach($attrs as $attr){
- $attr = rtrim($attr);
- $attr = ltrim($attr);
- if(strcmp($attr,'')==0){
- continue;
- }
- if(!preg_match('/^[^\=]+\=[^\=]+$/',$attr)){
- print "ERROR: attribute is not correctly formatted on line $i: $attr\n";
- return '';
- }
- // break apart each tag
- $tag = explode("=",$attr); // split by equals sign
- // multiple instances of an attribute are separated by commas
- $tags[$tag[0]] = explode(",",$tag[1]); // split by comma
- if(strcmp($tag[0],'ID')==0){
- $attr_uniquename = $tag[1];
- }
- if(strcmp($tag[0],'Name')==0){
- $attr_name = $tag[1];
- }
- }
- if(strcmp($attr_name,'')==0){
- $attr_name = $attr_uniquename;
- }
- // skip features that have no ID attribute
- if(!$attr_uniquename){
- continue;
- }
-
- // if the option is to remove or refresh then we want to remove
- // the feature from the database.
- if($remove or $refresh){
- print "Removing feature '$attr_uniquename'\n";
- $sql = "DELETE FROM {feature}
- WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
- $result = db_query($sql,$organism->organism_id,$attr_uniquename,$cvterm->cvterm_id);
- if(!$result){
- print "ERROR: cannot delete feature $attr_uniquename\n";
- }
- $feature = 0;
- }
- // add or update the feature and all properties
- if($update or $refresh or $add_only){
- // add/update the feature
- $feature = tripal_core_load_gff3_feature($organism,$cvterm,
- $attr_uniquename,$attr_name,$residues,$attr_is_analysis,$attr_is_obsolete,
- $add_only);
- if($feature){
- // add/update the featureloc if the landmark and the feature ID are not the same
- // if they are the same then this entry in the GFF is probably a landmark identifier
- if(strcmp($landmark,$attr_uniquename)!=0){
- tripal_core_load_gff3_featureloc($feature,$organism,
- $landmark,$fmin,$fmax,$strand,$phase,$attr_fmin_partial,
- $attr_fmax_partial,$attr_residue_info,$attr_locgroup);
- }
- // add any aliases for this feature
- if(array_key_exists('Alias',$tags)){
- tripal_core_load_gff3_alias($feature,$tags['Alias']);
- }
- // add any aliases for this feature
- if(array_key_exists('Dbxref',$tags)){
- tripal_core_load_gff3_dbxref($feature,$tags['Dbxref']);
- }
- }
- }
- }
- tripal_db_set_active($previous_db);
- return '';
- }
- /*************************************************************************
- *
- */
- function tripal_core_load_gff3_dbxref($feature,$dbxrefs){
- foreach($dbxrefs as $dbxref){
- print "Adding: $dbxref\n";
- }
- }
- /*************************************************************************
- *
- */
- function tripal_core_load_gff3_alias($feature,$aliases){
- // make sure we have a 'synonym_type' vocabulary
- $sql = "SELECT * FROM {cv} WHERE name='synonym_type'";
- $syncv = db_fetch_object(db_query($sql));
- if(!$syncv){
- $sql = "INSERT INTO {cv} (name,definition) VALUES ('synonym_type','')";
- if(!db_query($sql)){
- print("ERROR: Failed to add the synonyms type vocabulary");
- return 0;
- }
- $syncv = db_fetch_object(db_query($sql));
- }
- // get the 'exact' cvterm, which is the type of synonym we're adding
- $cvtsql = "
- SELECT * FROM {cvterm} CVT
- INNER JOIN {cv} CV ON CVT.cv_id = CV.cv_id
- WHERE CVT.name = '%s' and CV.name = '%s'
- ";
- $syntype = db_fetch_object(db_query($cvtsql,'exact','synonym_type'));
- if(!$syntype){
- $term = array(
- 'name' => array('exact'),
- 'id' => array("internal:exact"),
- 'definition' => array(''),
- 'is_obsolete' => array(0),
- );
- $syntype = tripal_cv_obo_add_cv_term($term,$syncv,0,1);
- if(!$syntype){
- tripal_cv_obo_quiterror("Cannot add synonym type: internal:$type");
- }
- }
- // iterate through all of the aliases and add each one
- foreach($aliases as $alias){
- print " Adding Alias $alias\n";
- // check to see if the alias already exists in the synonym table
- // if not, then add it
- $synsql = "SELECT * FROM {synonym}
- WHERE name = '%s' and type_id = %d";
- $synonym = db_fetch_object(db_query($synsql,$alias,$syntype->cvterm_id));
- if(!$synonym){
- $sql = "INSERT INTO {synonym}
- (name,type_id,synonym_sgml)
- VALUES ('%s',%d,'%s')";
- $result = db_query($sql,$alias,$syntype->cvterm_id,'');
- if(!$result){
- print "ERROR: cannot add alias $alias to synonym table\n";
- }
- }
- $synonym = db_fetch_object(db_query($synsql,$alias,$syntype->cvterm_id));
- // check to see if we have a NULL publication in the pub table. If not,
- // then add one.
- $pubsql = "SELECT * FROM {pub} WHERE uniquename = 'null'";
- $pub = db_fetch_object(db_query($pubsql));
- if(!$pub){
- $sql = "INSERT INTO pub (uniquename,type_id) VALUES ('%s',
- (SELECT cvterm_id
- FROM cvterm CVT
- INNER JOIN dbxref DBX on DBX.dbxref_id = CVT.dbxref_id
- INNER JOIN db DB on DB.db_id = DBX.db_id
- WHERE CVT.name = 'null' and DB.name = 'null')";
- $result = db_query($sql,'null');
- if(!$result){
- print "ERROR: cannot add null publication needed for setup of alias\n";
- return 0;
- }
- }
- $pub = db_fetch_object(db_query($pubsql));
- // check to see if the synonym exists in the feature_synonym table
- // if not, then add it.
- $synsql = "SELECT * FROM {feature_synonym}
- WHERE synonym_id = %d and feature_id = %d and pub_id = %d";
- $fsyn = db_fetch_object(db_query($synsql,$synonym->synonym_id,$feature->feature_id,$pub->pub_id));
- if(!$fsyn){
- $sql = "INSERT INTO {feature_synonym}
- (synonym_id,feature_id,pub_id)
- VALUES (%d,%d,%d)";
- $result = db_query($sql,$synonym->synonym_id,$feature->feature_id,$pub->pub_id);
- if(!$result){
- print "ERROR: cannot add alias $alias to feature synonym table\n";
- return 0;
- }
- }
- $fsyn = db_fetch_object(db_query($synsql,$synonym->synonym_id,$feature->feature_id,$pub->pub_id));
- }
- return 1;
- }
- /*************************************************************************
- *
- */
- function tripal_core_load_gff3_feature($organism,$cvterm,$uniquename,$name,
- $residues,$is_analysis='f',$is_obsolete='f',$add_only) {
- // check to see if the feature already exists
- $feature_sql = "SELECT * FROM {feature}
- WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
- $feature = db_fetch_object(db_query($feature_sql,$organism->organism_id,$uniquename,$cvterm->cvterm_id));
- if(strcmp($is_obsolete,'f')==0){
- $is_obsolete = 'false';
- }
- if(strcmp($is_analysis,'f')==0){
- $is_analysis = 'false';
- }
- // insert the feature if it does not exist otherwise perform an update
- if(!$feature){
- print "Adding feature '$uniquename'\n";
- $sql = "INSERT INTO {feature} (organism_id, name, uniquename, residues, seqlen,
- md5checksum, type_id,is_analysis,is_obsolete)
- VALUES(%d,'%s','%s','%s',%d, '%s', %d, %s, %s)";
- $result = db_query($sql,$organism->organism_id,$name,$uniquename,$residues,strlen($residues),
- md5($residues),$cvterm->cvterm_id,$is_analysis,$is_obsolete);
- if(!$result){
- print "ERROR: failed to insert feature '$uniquename'\n";
- return 0;
- }
- }
- elseif(!$add_only) {
- print "Updating feature '$uniquename'\n";
- $sql = "UPDATE {feature}
- SET name = '%s', residues = '%s', seqlen = '%s', md5checksum = '%s',
- is_analysis = %s, is_obsolete = %s
- WHERE organism_id = %d and uniquename = '%s' and type_id = %d";
- $result = db_query($sql,$name,$residues,strlen($residues),md5($residues),$is_analysis,$is_obsolete);
- if(!$result){
- print "ERROR: failed to update feature '$uniquename'\n";
- return 0;
- }
- }
- else {
- // the feature exists and we don't want to update it so return
- // a value of 0. This will stop all downstream property additions
- print "Skipping existing feature: '$uniquename'.\n";
- return 0;
- }
- $feature = db_fetch_object(db_query($feature_sql,$organism->organism_id,$uniquename,$cvterm->cvterm_id));
- return $feature;
- }
- /*************************************************************************
- *
- */
- function tripal_core_load_gff3_featureloc($feature,$organism,$landmark,$fmin,$fmax,$strand,$phase,
- $is_fmin_partial,$is_fmax_partial,$residue_info,$locgroup) {
-
- // get the source feature
- $sql = "SELECT * FROM {feature}
- WHERE organism_id = %d and uniquename = '%s'";
- $srcfeature = db_fetch_object(db_query($sql,$organism->organism_id,$landmark));
- if(!$srcfeature){
- print "ERROR: cannot find source feature $landmark.\n";
- return 0;
- }
- // TODO: create an attribute that recognizes the residue_info,locgroup, is_fmin_partial and is_fmax_partial, right now these are
- // hardcoded to be false and 0 below.
- // check to see if this featureloc already exists, but also keep track of the
- // last rank value
- $rank = 0;
- $exists = 0;
- $featureloc_sql = "SELECT FL.featureloc_id,FL.fmin,FL.fmax, FL.is_fmin_partial,
- FL.is_fmax_partial, FL.strand, FL.phase, FL.residue_info,
- FL.locgroup, F.uniquename as srcname
- FROM {featureloc} FL
- INNER JOIN {feature} F on F.feature_id = FL.srcfeature_id
- WHERE FL.feature_id = %d
- ORDER BY rank ASC";
- $recs = db_query($featureloc_sql,$feature->feature_id);
- while ($featureloc = db_fetch_object($recs)){
- if(strcmp($featureloc->srcname,$landmark)==0 and
- $featureloc->fmin == $fmin and strcmp($featureloc->is_fmin_partial,$is_fmin_partial)==0 and
- $featureloc->fmax == $fmax and strcmp($featureloc->is_fmax_partial,$is_fmax_partial)==0 and
- $featureloc->phase == $phase and $featureloc->strand == $strand and
- strcmp($featureloc->residue_info,$residue_info)==0 and
- $featureloc->locgroup == $locgroup){
- // this is the same featureloc, so do nothing... no need to update
- //TODO: need more checks here
- print " No change to featureloc\n";
- $exists = 1;
- }
- $rank = $featureloc->rank;
- }
- if(!$exists){
- // this feature does not have a feature loc so add it
- if(!$phase){
- $phase = 'NULL';
- }
- if(strcmp($is_fmin_partial,'f')==0){
- $is_fmin_partial = 'false';
- }
- elseif(strcmp($is_fmin_partial,'t')==0){
- $is_fmin_partial = 'true';
- }
- if(strcmp($is_fmax_partial,'f')==0){
- $is_fmax_partial = 'false';
- }
- elseif(strcmp($is_fmax_partial,'t')==0){
- $is_fmax_partial = 'true';
- }
- print " Adding featureloc $srcfeature->uniquename fmin: $fmin, fmax: $fmax, strand: $strand, phase: $phase, rank: $rank\n";
- $fl_isql = "INSERT INTO {featureloc}
- (feature_id, srcfeature_id, fmin, is_fmin_partial, fmax, is_fmax_partial,
- strand, phase, residue_info, locgroup, rank)
- VALUES (%d,%d,%d,%s,%d,%s,%d,%s,'%s',%d,%d)";
- $result = db_query($fl_isql,$feature->feature_id,$srcfeature->feature_id,$fmin,$is_fmin_partial,$fmax,$is_fmax_partial,
- $strand,$phase,$residue_info,$locgroup,$rank);
- if(!$result){
- print "ERROR: failed to insert featureloc\n";
- return 0;
- }
- }
- return 1;
- }
- /*************************************************************************
- *
- */
|