Search

Exploring a rose garden with SPARQL

*Philippe Rocca-Serra (philippe.rocca-serra[at]oerc.ox.ac.uk), University of Oxford e-Research Centre

import rdflib
from rdflib import Graph, RDF
from IPython.core.display import display, HTML
import os
import json
import csv
import uuid

from SPARQLWrapper import SPARQLWrapper, SPARQLWrapper2, JSON, JSONLD, CSV, TSV, N3, RDF, RDFXML, TURTLE
import pandas as pds
import itertools

import numpy as np
from plotnine import *
def queryResultToHTMLTable(queryResult):
    
   HTMLResult = '<table><tr style="color:white;background-color:#43BFC7;font-weight:bold">'
   # print variable names and build header:
   for varName in queryResult.vars:
       HTMLResult = HTMLResult + '<td>' + varName + '</td>'
   HTMLResult = HTMLResult + '</tr>'

   # print values from each row and build table of results
   for row in queryResult:
      HTMLResult = HTMLResult + '<tr>'   
      for column in row:
        #print("COLUMN:", column)
        if column is not "":
             HTMLResult = HTMLResult + '<td>' +  str(column) + '</td>'
        else:
             HTMLResult = HTMLResult + '<td>' + "N/A"+ '</td>'
      HTMLResult = HTMLResult + '</tr>'
   HTMLResult = HTMLResult + '</table>'
   display(HTML(HTMLResult))
def get_sparql_variables(results, sparql_wrapper="SPARQLWrapper2"):
#     return results.vars if ("sparqlwrapper2" == sparql_wrapper.lower()) else results['head']['vars']
    return results.vars if ("sparqlwrapper2" == sparql_wrapper.lower()) else results.vars


def get_sparql_bindings(results, sparql_wrapper="SPARQLWrapper2"):
    return results.bindings if ("sparqlwrapper2" == sparql_wrapper.lower()) else results['results']['bindings']


def get_sparql_binding_variable_value(binding, variable, sparql_wrapper="SPARQLWrapper2"):
    return binding[variable] if ("sparqlwrapper2" == sparql_wrapper.lower()) else binding[variable]['value']

def make_sparql_dict_list(bindings, variables, sparql_wrapper="SPARQLWrapper2"):
    def binding_value(binding, var): # helper function for returning values
        return get_sparql_binding_variable_value(binding, var, sparql_wrapper) if (var in binding) else None

    dict_list = []  # list to contain dictionaries
    for binding in itertools.chain(bindings):
        values = [binding_value(binding, var) for var in itertools.chain(variables)]
        dict_list.append(dict(zip(variables, values)))

    return dict_list


def make_sparql_df(results, sparql_wrapper="SPARQLWrapper2"):
    variables = get_sparql_variables(results, sparql_wrapper)
    # print(variables)

    cleaned_variables=[str(var.replace('\\n','')) for var in variables] 

    #print(cleaned_variables)
    bindings = get_sparql_bindings(results, sparql_wrapper)
    # print(bindings)
    
    # create a list of dictionaries to use as data for dataframe
    data_list = make_sparql_dict_list(bindings, cleaned_variables, sparql_wrapper)
    
    # print(data_list)

    df = pds.DataFrame(data_list) # create dataframe from data list
    df["sample_mean"] = df["sample_mean"].astype("float")

   # print(df["sample_mean"])
    return df[cleaned_variables] # return dataframe with columns reordered

*credits to Bob du Charme for the following function [http://www.snee.com/bobdc.blog/2016/07/sparql-in-a-jupyter-aka-ipytho.html]

g = Graph()

Let's read the RDF graph generated using the rose-dtpkg2rdf.py python script and saved to disk as a turtle file

#g.parse("./../data/processed/rose-data-as-rdf/rose-aroma-ng-06-2018-subset.ttl", format="n3")
g.parse("./../data/processed/rose-data-as-rdf/rose-aroma-ng-06-2018-full.ttl", format="n3")
# g.parse("./../data/processed/denovo/rdf/rose-aroma-ng-06-2018-full.ttl", format="n3")
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-5-25e498617ce4> in <module>
      1 #g.parse("./../data/processed/rose-data-as-rdf/rose-aroma-ng-06-2018-subset.ttl", format="n3")
----> 2 g.parse("./../data/processed/rose-data-as-rdf/rose-aroma-ng-06-2018-full.ttl", format="n3")
      3 # g.parse("./../data/processed/denovo/rdf/rose-aroma-ng-06-2018-full.ttl", format="n3")

~/.pyenv/versions/venv372/lib/python3.7/site-packages/rdflib/graph.py in parse(self, source, publicID, format, location, file, data, **args)
   1032         source = create_input_source(source=source, publicID=publicID,
   1033                                      location=location, file=file,
-> 1034                                      data=data, format=format)
   1035         if format is None:
   1036             format = source.content_type

~/.pyenv/versions/venv372/lib/python3.7/site-packages/rdflib/parser.py in create_input_source(source, publicID, location, file, data, format)
    182         if absolute_location.startswith("file:///"):
    183             filename = url2pathname(absolute_location.replace("file:///", "/"))
--> 184             file = open(filename, "rb")
    185         else:
    186             input_source = URLInputSource(absolute_location, format)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/philippe/Documents/git/FAIRplus-org/the-fair-cookbook/docs/content/recipes/applied-examples/data/processed/rose-data-as-rdf/rose-aroma-ng-06-2018-full.ttl'

Now let's ask for the independent variables and their levels using the following SPARQL query

get_idv_and_levels = g.query("""
PREFIX stato: <http://purl.obolibrary.org/obo/STATO_>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
PREFIX ncbitax: <http://purl.obolibrary.org/obo/NCBITaxon_>
prefix has_part: <http://purl.obolibrary.org/obo/BFO_0000051>
            SELECT DISTINCT
             ?Predictor
             ?PredictorLevel
             WHERE { 
                ?var a stato:0000087 ;
                    rdfs:label ?Predictor ;
                    has_part: ?value .
                ?value rdfs:label ?PredictorLevel .    
                 }               
""")

We can display the results of that query using the function declared earlier on.

queryResultToHTMLTable(get_idv_and_levels)
PredictorPredictorLevel

Let's now ask for the number of biological and technical replicates used to compute the mean concentration of the chemical compounds detected and forming the signature of the rose fragrance.

get_replication_info = g.query("""
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
prefix chmo:   <http://purl.obolibrary.org/obo/CHMO_> 
prefix msio:   <http://purl.obolibrary.org/obo/MSIO_> 
prefix stato: <http://purl.obolibrary.org/obo/STATO_> 
prefix obi: <http://purl.obolibrary.org/obo/OBI_> 
prefix ro: <http://purl.obolibrary.org/obo/RO_>
prefix po: <http://purl.obolibrary.org/obo/PO_>
prefix has_member: <http://purl.obolibrary.org/obo/RO_0002351>
prefix has_value: <http://purl.obolibrary.org/obo/STATO_0000129> 
prefix computed_from: <http://purl.obolibrary.org/obo/STATO_0000557>
prefix has_specified_input: <http://purl.obolibrary.org/obo/OBI_0000293>
prefix is_specified_output_of: <http://purl.obolibrary.org/obo/OBI_0000295>
prefix is_about: <http://purl.obolibrary.org/obo/IAO_0000136>

SELECT        
      ?TreatmentGroup 
      ?ChemicalCompound
      ?MeanConcentration
      (count(distinct ?member) as ?NbTechnicalReplicate) 
      (count(distinct ?input) as ?NbBiologicalReplicate)      
      WHERE {
            ?population a stato:0000193 ;
                rdfs:label ?TreatmentGroup ;
                has_member: ?member .      
            ?member has_specified_input: ?input .              
            ?mean a stato:0000402 ;
                computed_from: ?population ;
                has_value: ?MeanConcentration ;
                is_about: ?ChemicalCompound .
            ?concentration a stato:0000072;
                is_about: ?ChemicalCompound .
}
      GROUP BY ?population 
""")

Once more, we invoked the pretty printing function:

queryResultToHTMLTable(get_replication_info)
TreatmentGroupChemicalCompoundMeanConcentrationNbTechnicalReplicateNbBiologicalReplicate
get_all_data = g.query("""
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
prefix chmo:   <http://purl.obolibrary.org/obo/CHMO_> 
prefix msio:   <http://purl.obolibrary.org/obo/MSIO_> 
prefix stato: <http://purl.obolibrary.org/obo/STATO_> 
prefix obi: <http://purl.obolibrary.org/obo/OBI_> 
prefix ro: <http://purl.obolibrary.org/obo/RO_>
prefix po: <http://purl.obolibrary.org/obo/PO_>
prefix chebi: <http://purl.obolibrary.org/obo/CHEBI_>
prefix has_value: <http://purl.obolibrary.org/obo/STATO_0000129>
prefix computed_from: <http://purl.obolibrary.org/obo/STATO_0000557>
prefix is_about: <http://purl.obolibrary.org/obo/IAO_0000136>
prefix is_denoted_by: <http://purl.obolibrary.org/obo/STATO_0000205>
prefix derives_from: <http://purl.obolibrary.org/obo/RO_0001000> 
prefix located_in: <http://purl.obolibrary.org/obo/RO_0001025>
prefix denotes: <http://purl.obolibrary.org/obo/IAO_0000219>
prefix measured_in: <http://purl.obolibrary.org/obo/RO_0002351> 

SELECT REDUCED  ?chemical_name ?chebi_identifier ?inchi ?sample_mean ?sem ?treatment ?genotype ?organism_part
WHERE {
        ?pop_mean a stato:0000402 ;
         is_about: ?chebi_identifier ;
         computed_from: ?population ;
         has_value: ?sample_mean .
     ?chem a ?chebi_identifier ;
         rdfs:label ?chemical_name ;
         is_denoted_by: ?inchi .
     ?semv a stato:0000037 ; 
         denotes: ?pop_mean ;
         has_value: ?sem.
     ?population a stato:0000193 ;
         rdfs:label ?treatment .
     ?sub_conc a stato:0000072 ;
         derives_from: ?genotype ;
         located_in: ?organism_part;
         measured_in: ?population .         
}        
""")
queryResultToHTMLTable(get_all_data)
chemical_namechebi_identifierinchisample_meansemtreatmentgenotypeorganism_part
data=make_sparql_df(get_all_data)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/.pyenv/versions/venv372/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2896             try:
-> 2897                 return self._engine.get_loc(key)
   2898             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'sample_mean'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-12-74226eea9bba> in <module>
----> 1 data=make_sparql_df(get_all_data)

<ipython-input-3-e97086bf9422> in make_sparql_df(results, sparql_wrapper)
     39 
     40     df = pds.DataFrame(data_list) # create dataframe from data list
---> 41     df["sample_mean"] = df["sample_mean"].astype("float")
     42 
     43    # print(df["sample_mean"])

~/.pyenv/versions/venv372/lib/python3.7/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2993             if self.columns.nlevels > 1:
   2994                 return self._getitem_multilevel(key)
-> 2995             indexer = self.columns.get_loc(key)
   2996             if is_integer(indexer):
   2997                 indexer = [indexer]

~/.pyenv/versions/venv372/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2897                 return self._engine.get_loc(key)
   2898             except KeyError:
-> 2899                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2900         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2901         if indexer.ndim > 1 or indexer.size > 1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'sample_mean'
# width = figure_size[0]
# height = figure_size[0] * aspect_ratio
gray = '#666666'
orange = '#FF8000'
blue = '#3333FF'

p1 = (ggplot(data)
 + aes('chemical_name','sample_mean',fill='factor(treatment)')
 + geom_col()
+ facet_wrap('~treatment', dir='v',ncol=1)
 + scale_y_continuous(expand = (0,0))
 + theme(axis_text_x=element_text(rotation=90, hjust=1, fontsize=6, color=blue))
 + theme(axis_text_y=element_text(rotation=0, hjust=2, fontsize=6, color=orange))
         + theme(figure_size = (8, 16))
)

p1 + theme(panel_background=element_rect(fill=blue)
       )

p1
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-13-50ce0a623536> in <module>
     12  + theme(axis_text_x=element_text(rotation=90, hjust=1, fontsize=6, color=blue))
     13  + theme(axis_text_y=element_text(rotation=0, hjust=2, fontsize=6, color=orange))
---> 14          + theme(figure_size = (8, 16))
     15 )
     16 

NameError: name 'data' is not defined