My favorites | Sign in
Project Logo
Project hosting will be READ-ONLY Wednesday at 8am PST due to brief network maintenance.
                
Search
for
Updated Feb 04 (5 days ago) by jun.zhao...@googlemail.com
FlyBaseBenchmark  
details of SQL and SPARQL benchmark using flybase dataset

Query 1 – select gene names, symbols and synonyms

SPARQL

# flybase benchmark query 1 (SPARQL)

PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>    
PREFIX chado: <http://purl.org/net/chado/schema/>
PREFIX so: <http://purl.org/obo/owl/SO#>
PREFIX syntype: <http://purl.org/net/flybase/synonym-types/>

SELECT DISTINCT ?flybaseid ?annotationsymbol ?fullname WHERE {

  ?feature chado:name "schuy"^^xsd:string ; 
    a so:SO_0000704 ;
    chado:organism  [
      chado:genus "Drosophila"^^xsd:string ;
      chado:species "melanogaster"^^xsd:string ;
    ] ;
    chado:is_analysis "false"^^xsd:boolean ;
    chado:is_obsolete "false"^^xsd:boolean;
    chado:uniquename ?flybaseid ;
    chado:feature_dbxref [ 
      chado:accession ?annotationsymbol ; 
      chado:db <http://openflydata.org/id/flybase/db/FlyBase_Annotation_IDs> 
    ] .

  ?feature_fullname 
    chado:feature ?feature ; 
    chado:is_current "true"^^xsd:boolean ;
    a chado:Feature_Synonym ;
    chado:synonym [ 
      a syntype:FullName ;
      chado:name ?fullname ; 
    ] ;
  .

}

Run on 2009-04-21 at 16:12 with 'schuy' ... 391, 281, 281, 282, 266, 265, 312, 250, 250, 281 (ms)

SQL

-- flybase benchmark query 1 (SQL)
SELECT DISTINCT 
feature.uniquename AS flybaseid, 
dbxref.accession AS annotationsymbol, 
synonym.name AS fullname
FROM feature
JOIN cvterm ON (feature.type_id = cvterm.cvterm_id)
JOIN organism ON (feature.organism_id = organism.organism_id)
JOIN feature_dbxref ON (feature.feature_id = feature_dbxref.feature_id)
JOIN dbxref ON (feature_dbxref.dbxref_id = dbxref.dbxref_id)
JOIN feature_synonym ON (feature.feature_id = feature_synonym.feature_id)
JOIN synonym ON (feature_synonym.synonym_id = synonym.synonym_id)
WHERE feature.name = 'schuy'
AND cvterm.cvterm_id = 219 
AND organism.genus = 'Drosophila'
AND organism.species = 'melanogaster'
AND feature.is_analysis = false
AND feature.is_obsolete = false
AND dbxref.db_id = 1 
AND feature_synonym.is_current = true
AND synonym.type_id = 60037 ;

Run on 2009-04-21 16:01 with 'schuy' ... 1516, 704, 703, 703, 704, 687, 704, 703, 688, 703 (ms)

Comparison

Data below are from 2009-04-21 10 queries each...

FlyBase Chado Relational Database OpenFlyData SPARQL Endpoint
Shortest (ms) 687 250
Longest (ms) 1516 391
Average (ms) 782 286
Standard deviation 258 41


Query 2 - select publications for a given gene

SPARQL

# flybase benchmark query 2 (SPARQL)

PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX chado: <http://purl.org/net/chado/schema/>
PREFIX so: <http://purl.org/obo/owl/SO#>

SELECT DISTINCT ?pubid ?title ?pyear ?pages ?volume ?issue ?miniref WHERE {
  
  ?feature 
    chado:uniquename "FBgn0004644"^^xsd:string ;
    a so:SO_0000704 ;
    chado:organism  [
      chado:genus "Drosophila"^^xsd:string ;
      chado:species "melanogaster"^^xsd:string ;
    ] ;
    chado:is_analysis "false"^^xsd:boolean ;
    chado:is_obsolete "false"^^xsd:boolean ;
    chado:feature_pub ?pub .

  ?pub a chado:Pub ;
    chado:uniquename ?pubid .
  
  OPTIONAL { ?pub chado:title ?title }
  OPTIONAL { ?pub chado:pyear ?pyear }
  OPTIONAL { ?pub chado:pages ?pages }
  OPTIONAL { ?pub chado:volume ?volume }
  OPTIONAL { ?pub chado:issue ?issue }
  OPTIONAL { ?pub chado:miniref ?miniref }

}

Run on 2009-04-21 17:50 with 'FBgn0004464' (hh) ... 313, 641, 422, 688, 1954, 1031, 500, 687, 610, 734 (ms)

SQL

-- flybase benchmark query 2 (SQL)
SELECT DISTINCT
pub.uniquename AS pubid,
pub.title AS title,
pub.pyear AS pyear,
pub.pages AS pages,
pub.volume AS volume,
pub.issue AS issue,
pub.miniref AS miniref
FROM feature
JOIN cvterm ON (feature.type_id = cvterm.cvterm_id)
JOIN organism ON (feature.organism_id = organism.organism_id)
JOIN feature_pub ON (feature.feature_id = feature_pub.feature_id)
JOIN pub ON (feature_pub.pub_id = pub.pub_id)
WHERE feature.uniquename = 'FBgn0004644'
AND cvterm.cvterm_id = 219 
AND organism.genus = 'Drosophila'
AND organism.species = 'melanogaster'
AND feature.is_analysis = false
AND feature.is_obsolete = false ;

Run on 2009-04-21 17:03 with 'FBgn0004464' (hh) ... 6032, 1688, 1687, 1547, 1547, 1562, 1703, 2219, 2219, 2218 (ms)

Comparison

Data below are from 2009-04-21 10 queries each...

FlyBase Chado Relational Database OpenFlyData SPARQL Endpoint
Shortest (ms) 1547 313
Longest (ms) 6032 1954
Average (ms) 2242 758
Standard deviation 1362 463


Query 3 - Select genes whose range includes a given genome position

SPARQL

# flybase benchmark query 3 (sparql)

PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX chado: <http://purl.org/net/chado/schema/>
PREFIX so: <http://purl.org/obo/owl/SO#>
 
SELECT ?name ?uniquename WHERE {
  
  {
    ?organism 
      chado:genus "Drosophila"^^xsd:string ;
      chado:species "melanogaster"^^xsd:string ;
      a chado:Organism ;
    .
  
    ?srcfeature
      chado:uniquename "3R"^^xsd:string ;
      a so:SO_0000105 ;
      chado:organism ?organism ;
    .
 
    ?featureloc
      chado:srcfeature ?srcfeature ;
      chado:fmin ?fmin ;
      chado:fmax ?fmax ;
    .
  
    FILTER ( ?fmin <= 21809000 && ?fmax >= 21809000 )
  }

  {
    ?featureloc chado:feature ?feature .
    ?feature a so:SO_0000704 ;
      chado:organism ?organism ;
      chado:is_analysis "false"^^xsd:boolean ;
      chado:is_obsolete "false"^^xsd:boolean ;
      chado:name ?name ; 
      chado:uniquename ?uniquename .
  }

}

Run on 2009-04-22 at ~11:39 ... 24, 22, 18, 16, 17, 19, 20, 19, 19, 21 (s)

Run on 2009-05-08 at ~08:50 ... 18, 18, 23, 20, 18, 20, 18, 22, 21, 19 (s)

SQL

-- flybase benchmark query 3 (sql)
SELECT 
feature.name AS name,
feature.uniquename AS uniquename
FROM organism
JOIN feature AS srcfeature ON (organism.organism_id = srcfeature.organism_id)
JOIN cvterm AS srcfeaturetype ON (srcfeature.type_id = srcfeaturetype.cvterm_id)
JOIN featureloc ON (srcfeature.feature_id = featureloc.srcfeature_id)
JOIN feature ON (featureloc.feature_id = feature.feature_id)
JOIN cvterm AS featuretype ON (feature.type_id = featuretype.cvterm_id)
WHERE organism.genus = 'Drosophila'
AND organism.species = 'melanogaster'
AND feature.organism_id = srcfeature.organism_id
AND srcfeature.uniquename = '3R' 
AND srcfeaturetype.cvterm_id = 210
AND featureloc.fmin <= 21809000
AND featureloc.fmax >= 21809000
AND featuretype.cvterm_id = 219
AND feature.is_analysis = false
AND feature.is_obsolete = false 
;

Run on 2009-04-22 at 11:16 ... > 10000s.

However, if you reorder this query as...

SELECT 
feature.name AS nm,
feature.uniquename AS uniquename
FROM feature AS srcfeature 
JOIN organism ON (organism.organism_id = srcfeature.organism_id)
JOIN featureloc ON (srcfeature.feature_id = featureloc.srcfeature_id)
JOIN feature ON (featureloc.feature_id = feature.feature_id)
JOIN cvterm AS featuretype ON (feature.type_id = featuretype.cvterm_id)
JOIN cvterm AS srcfeaturetype ON (srcfeature.type_id = srcfeaturetype.cvterm_id)
WHERE srcfeature.uniquename = '3R' 
AND srcfeaturetype.cvterm_id = 210
AND organism.genus = 'Drosophila'
AND organism.species = 'melanogaster'
AND feature.organism_id = srcfeature.organism_id
AND featureloc.fmin <= 21809000
AND featureloc.fmax >= 21809000
AND featuretype.cvterm_id = 219
AND feature.is_analysis = false
AND feature.is_obsolete = false 
;

... you get dramatically better performance, e.g. 266ms.

Run on 2009-05-08 at ~08:50 ... 912, 272, 272, 272, 272, 268, 273, 272, 270, 270

Comparison

FlyBase Chado Relational Database OpenFlyData SPARQL Endpoint
Shortest (s) 0.27 18
Longest (s) 0.91 23
Average (s) 0.34 19.7
Standard deviation 0.20 1.8


Sign in to add a comment
Hosted by Google Code