Changeset: eb32228c325e for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=eb32228c325e
Modified Files:
        monetdb5/extras/rdf/rdf.h
        monetdb5/extras/rdf/rdflabels.c
        monetdb5/extras/rdf/rdflabels.h
        monetdb5/extras/rdf/rdfontologyload.c
        monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

Store oids instead of strings to improve performance
Store oids during the labeling process and transform them into strings for
export only. URI string format: <http://xxxxxxxx>/


diffs (truncated from 2090 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h
--- a/monetdb5/extras/rdf/rdf.h
+++ b/monetdb5/extras/rdf/rdf.h
@@ -105,13 +105,13 @@ typedef enum {
 
 // Final data structure that stores the labels for tables and attributes
 typedef struct CSlabel {
-       str             name;           // table name
-       str             *candidates;    // list of table name candidates, 
candidates[0] == name
+       oid             name;           // table name
+       oid             *candidates;    // list of table name candidates, 
candidates[0] == name
        int             candidatesCount;// number of entries in the candidates 
list
-       str             *hierarchy;     // hierarchy "bottom to top"
+       oid             *hierarchy;     // hierarchy "bottom to top"
        int             hierarchyCount; // number of entries in the hierarchy 
list
        int             numProp;        // number of properties, copied from 
freqCSset->items[x].numProp
-       char            **lstProp;      // attribute names (same order as in 
freqCSset->items[x].lstProp)
+       oid             *lstProp;       // attribute names (same order as in 
freqCSset->items[x].lstProp)
 } CSlabel;
 
 #endif /* _RDF_H_ */
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -27,79 +27,79 @@
 // list of known ontologies
 int ontologyCount = 73;
 ontology ontologies[] = {
-{{"http:", "www.facebook.com", "2008"}, 3},
-{{"http:", "facebook.com", "2008"}, 3},
-{{"http:", "developers.facebook.com", "schema"}, 3},
-{{"https:", "www.facebook.com", "2008"}, 3},
-{{"http:", "purl.org", "dc", "elements", "1.1"}, 5}, // dc DublinCore
-{{"http:", "purl.org", "dc", "terms"}, 4}, // DublinCore
-{{"http:", "purl.org", "goodrelations", "v1"}, 4}, // GoodRelations
-{{"http:", "purl.org", "rss", "1.0", "modules"}, 5},
-{{"http:", "purl.org", "stuff"}, 3},
-{{"http:", "www.purl.org", "stuff"}, 3},
-{{"http:", "ogp.me", "ns"}, 3},
-{{"https:", "ogp.me", "ns"}, 3},
-{{"http:", "www.w3.org", "1999", "02", "22-rdf-syntax-ns"}, 5}, // rdf
-{{"http:", "www.w3.org", "2000", "01", "rdf-schema"}, 5}, // rdfs
-{{"http:", "www.w3.org", "2004", "02", "skos", "core"}, 6}, // skos (Simple 
Knowledge Organization System)
-{{"http:", "www.w3.org", "2002", "07", "owl"}, 5},
-{{"http:", "www.w3.org", "2006", "vcard", "ns"}, 5}, // vcard
-{{"http:", "www.w3.org", "2001", "vcard-rdf", "3.0"}, 5},
-{{"http:", "www.w3.org", "2003", "01", "geo", "wgs84_pos"}, 6}, // geo
-{{"http:", "www.w3.org", "1999", "xhtml", "vocab"}, 5}, // xhtml
-{{"http:", "search.yahoo.com", "searchmonkey"}, 3},
-{{"https:", "search.yahoo.com", "searchmonkey"}, 3},
-{{"http:", "search.yahoo.co.jp", "searchmonkey"}, 3},
-{{"http:", "g.yahoo.com", "searchmonkey"}, 3},
-{{"http:", "opengraphprotocol.org", "schema"}, 3},
-{{"https:", "opengraphprotocol.org", "schema"}, 3},
-{{"http:", "opengraph.org", "schema"}, 3},
-{{"https:", "opengraph.org", "schema"}, 3},
-{{"http:", "creativecommons.org", "ns"}, 3}, // cc
-{{"http:", "rdf.data-vocabulary.org"}, 2}, // by google
-{{"http:", "rdfs.org", "sioc", "ns"}, 4}, // sioc (pronounced "shock", 
Semantically-Interlinked Online Communities Project)
-{{"http:", "xmlns.com", "foaf", "0.1"}, 4}, // foaf (Friend of a Friend)
-{{"http:", "mixi-platform.com", "ns"}, 3}, // japanese social graph
-{{"http:", "commontag.org", "ns"}, 3},
-{{"http:", "semsl.org", "ontology"}, 3}, // semantic web for second life
-{{"http:", "schema.org"}, 2},
-{{"http:", "openelectiondata.org", "0.1"}, 3},
-{{"http:", "search.aol.com", "rdf"}, 3},
-{{"http:", "www.loc.gov", "loc.terms", "relators"}, 4}, // library of congress
-{{"http:", "dbpedia.org", "ontology"}, 3}, // dbo
-{{"http:", "dbpedia.org", "resource"}, 3}, // dbpedia
-{{"http:", "dbpedia.org", "property"}, 3}, // dbp
-{{"http:", "www.aktors.org", "ontology", "portal"}, 4}, // akt (research, 
publications, ...)
-{{"http:", "purl.org", "ontology", "bibo"}, 4}, // bibo (bibliography)
-{{"http:", "purl.org", "ontology", "mo"}, 4}, // mo (music)
-{{"http:", "www.geonames.org", "ontology"}, 3}, // geonames
-{{"http:", "purl.org", "vocab", "frbr", "core"}, 5}, // frbr (Functional 
Requirements for Bibliographic Records)
-{{"http:", "www.w3.org", "2001", "XMLSchema"}, 4}, // xsd
-{{"http:", "www.w3.org", "2006", "time"}, 4}, // time
-{{"http:", "purl.org", "NET", "c4dm", "event.owl"}, 5}, // event
-{{"http:", "www.openarchives.org", "ore", "terms"}, 4}, // ore (Open Archive)
-{{"http:", "purl.org", "vocab", "bio", "0.1"}, 5}, // bio (biographical data)
-{{"http:", "www.holygoat.co.uk", "owl", "redwood", "0.1", "tags"}, 6}, // tag
-{{"http:", "rdfs.org", "ns", "void"}, 4}, // void (Vocabulary of Interlinked 
Datasets)
-{{"http:", "www.w3.org", "2006", "http"}, 4}, // http
-{{"http:", "purl.uniprot.org", "core"}, 3}, // uniprot (protein annotation)
-{{"http:", "umbel.org", "umbel"}, 3}, // umbel (Upper Mapping and Binding 
Exchange Layer)
-{{"http:", "purl.org", "stuff", "rev"}, 4}, // rev (review)
-{{"http:", "purl.org", "linked-data", "cube"}, 4}, // qb (data cube)
-{{"http:", "www.w3.org", "ns", "org"}, 4}, // org (organizations)
-{{"http:", "purl.org", "vocab", "vann"}, 4}, // vann (vocabulary for 
annotating vocabulary descriptions)
-{{"http:", "data.ordnancesurvey.co.uk", "ontology", "admingeo"}, 4}, // 
admingeo (administrative geography and civil voting area)
-{{"http:", "www.w3.org", "2007", "05", "powder-s"}, 5}, // wdrs (Web 
Description Resources)
-{{"http:", "usefulinc.com", "ns", "doap"}, 4}, // doap (Description of a 
Project)
-{{"http:", "lod.taxonconcept.org", "ontology", "txn.owl"}, 4}, // txn 
(TaxonConcept, species)
-{{"http:", "xmlns.com", "wot", "0.1"}, 4}, // wot (Web Of Trust)
-{{"http:", "purl.org", "net", "compass"}, 4}, // compass
-{{"http:", "www.w3.org", "2004", "03", "trix", "rdfg-1"}, 6}, // rdfg (RDF 
graph)
-{{"http:", "purl.org", "NET", "c4dm", "timeline.owl"}, 5}, // tl (timeline)
-{{"http:", "purl.org", "dc", "dcam"}, 4}, // dcam (DublinCore metadata)
-{{"http:", "swrc.ontoware.org", "ontology"}, 3}, // swrc (university, research)
-{{"http:", "zeitkunst.org", "bibtex", "0.1", "bibtex.owl"}, 5}, // bib (bibTeX 
entries)
-{{"http:", "purl.org", "ontology", "po"}, 4} // po (tv and radio programmes)
+{{"<http:", "www.facebook.com", "2008"}, 3},
+{{"<http:", "facebook.com", "2008"}, 3},
+{{"<http:", "developers.facebook.com", "schema"}, 3},
+{{"<https:", "www.facebook.com", "2008"}, 3},
+{{"<http:", "purl.org", "dc", "elements", "1.1"}, 5}, // dc DublinCore
+{{"<http:", "purl.org", "dc", "terms"}, 4}, // DublinCore
+{{"<http:", "purl.org", "goodrelations", "v1"}, 4}, // GoodRelations
+{{"<http:", "purl.org", "rss", "1.0", "modules"}, 5},
+{{"<http:", "purl.org", "stuff"}, 3},
+{{"<http:", "www.purl.org", "stuff"}, 3},
+{{"<http:", "ogp.me", "ns"}, 3},
+{{"<https:", "ogp.me", "ns"}, 3},
+{{"<http:", "www.w3.org", "1999", "02", "22-rdf-syntax-ns"}, 5}, // rdf
+{{"<http:", "www.w3.org", "2000", "01", "rdf-schema"}, 5}, // rdfs
+{{"<http:", "www.w3.org", "2004", "02", "skos", "core"}, 6}, // skos (Simple 
Knowledge Organization System)
+{{"<http:", "www.w3.org", "2002", "07", "owl"}, 5},
+{{"<http:", "www.w3.org", "2006", "vcard", "ns"}, 5}, // vcard
+{{"<http:", "www.w3.org", "2001", "vcard-rdf", "3.0"}, 5},
+{{"<http:", "www.w3.org", "2003", "01", "geo", "wgs84_pos"}, 6}, // geo
+{{"<http:", "www.w3.org", "1999", "xhtml", "vocab"}, 5}, // xhtml
+{{"<http:", "search.yahoo.com", "searchmonkey"}, 3},
+{{"<https:", "search.yahoo.com", "searchmonkey"}, 3},
+{{"<http:", "search.yahoo.co.jp", "searchmonkey"}, 3},
+{{"<http:", "g.yahoo.com", "searchmonkey"}, 3},
+{{"<http:", "opengraphprotocol.org", "schema"}, 3},
+{{"<https:", "opengraphprotocol.org", "schema"}, 3},
+{{"<http:", "opengraph.org", "schema"}, 3},
+{{"<https:", "opengraph.org", "schema"}, 3},
+{{"<http:", "creativecommons.org", "ns"}, 3}, // cc
+{{"<http:", "rdf.data-vocabulary.org"}, 2}, // by google
+{{"<http:", "rdfs.org", "sioc", "ns"}, 4}, // sioc (pronounced "shock", 
Semantically-Interlinked Online Communities Project)
+{{"<http:", "xmlns.com", "foaf", "0.1"}, 4}, // foaf (Friend of a Friend)
+{{"<http:", "mixi-platform.com", "ns"}, 3}, // japanese social graph
+{{"<http:", "commontag.org", "ns"}, 3},
+{{"<http:", "semsl.org", "ontology"}, 3}, // semantic web for second life
+{{"<http:", "schema.org"}, 2},
+{{"<http:", "openelectiondata.org", "0.1"}, 3},
+{{"<http:", "search.aol.com", "rdf"}, 3},
+{{"<http:", "www.loc.gov", "loc.terms", "relators"}, 4}, // library of congress
+{{"<http:", "dbpedia.org", "ontology"}, 3}, // dbo
+{{"<http:", "dbpedia.org", "resource"}, 3}, // dbpedia
+{{"<http:", "dbpedia.org", "property"}, 3}, // dbp
+{{"<http:", "www.aktors.org", "ontology", "portal"}, 4}, // akt (research, 
publications, ...)
+{{"<http:", "purl.org", "ontology", "bibo"}, 4}, // bibo (bibliography)
+{{"<http:", "purl.org", "ontology", "mo"}, 4}, // mo (music)
+{{"<http:", "www.geonames.org", "ontology"}, 3}, // geonames
+{{"<http:", "purl.org", "vocab", "frbr", "core"}, 5}, // frbr (Functional 
Requirements for Bibliographic Records)
+{{"<http:", "www.w3.org", "2001", "XMLSchema"}, 4}, // xsd
+{{"<http:", "www.w3.org", "2006", "time"}, 4}, // time
+{{"<http:", "purl.org", "NET", "c4dm", "event.owl"}, 5}, // event
+{{"<http:", "www.openarchives.org", "ore", "terms"}, 4}, // ore (Open Archive)
+{{"<http:", "purl.org", "vocab", "bio", "0.1"}, 5}, // bio (biographical data)
+{{"<http:", "www.holygoat.co.uk", "owl", "redwood", "0.1", "tags"}, 6}, // tag
+{{"<http:", "rdfs.org", "ns", "void"}, 4}, // void (Vocabulary of Interlinked 
Datasets)
+{{"<http:", "www.w3.org", "2006", "http"}, 4}, // http
+{{"<http:", "purl.uniprot.org", "core"}, 3}, // uniprot (protein annotation)
+{{"<http:", "umbel.org", "umbel"}, 3}, // umbel (Upper Mapping and Binding 
Exchange Layer)
+{{"<http:", "purl.org", "stuff", "rev"}, 4}, // rev (review)
+{{"<http:", "purl.org", "linked-data", "cube"}, 4}, // qb (data cube)
+{{"<http:", "www.w3.org", "ns", "org"}, 4}, // org (organizations)
+{{"<http:", "purl.org", "vocab", "vann"}, 4}, // vann (vocabulary for 
annotating vocabulary descriptions)
+{{"<http:", "data.ordnancesurvey.co.uk", "ontology", "admingeo"}, 4}, // 
admingeo (administrative geography and civil voting area)
+{{"<http:", "www.w3.org", "2007", "05", "powder-s"}, 5}, // wdrs (Web 
Description Resources)
+{{"<http:", "usefulinc.com", "ns", "doap"}, 4}, // doap (Description of a 
Project)
+{{"<http:", "lod.taxonconcept.org", "ontology", "txn.owl"}, 4}, // txn 
(TaxonConcept, species)
+{{"<http:", "xmlns.com", "wot", "0.1"}, 4}, // wot (Web Of Trust)
+{{"<http:", "purl.org", "net", "compass"}, 4}, // compass
+{{"<http:", "www.w3.org", "2004", "03", "trix", "rdfg-1"}, 6}, // rdfg (RDF 
graph)
+{{"<http:", "purl.org", "NET", "c4dm", "timeline.owl"}, 5}, // tl (timeline)
+{{"<http:", "purl.org", "dc", "dcam"}, 4}, // dcam (DublinCore metadata)
+{{"<http:", "swrc.ontoware.org", "ontology"}, 3}, // swrc (university, 
research)
+{{"<http:", "zeitkunst.org", "bibtex", "0.1", "bibtex.owl"}, 5}, // bib 
(bibTeX entries)
+{{"<http:", "purl.org", "ontology", "po"}, 4} // po (tv and radio programmes)
 };
 
 #if USE_SHORT_NAMES
@@ -107,25 +107,25 @@ ontology ontologies[] = {
 static
 void getPropNameShort(char** name, char* propStr) {
        char            *token;
-       char            *uri, *uriPtr;
+       char            *uri;
+       char            *uriPtr;
        int             length = 0;             // number of tokens
        char            **tokenizedUri = NULL;  // list of tokens
        int             i, j;
        int             fit;
 
        // tokenize uri
-       uri = (char *) malloc(sizeof(char) * (strlen(propStr) + 1));
+       uri = (char *) GDKmalloc(sizeof(char) * (strlen(propStr) + 1));
        if (!uri) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
        strcpy(uri, propStr); // uri will be modified during tokenization
        uriPtr = uri; // uri will be modified, uriPtr keeps original pointer
        token = strtok(uri, "/#");
        while (token != NULL) {
-               tokenizedUri = realloc(tokenizedUri, sizeof(char*) * ++length);
+               tokenizedUri = GDKrealloc(tokenizedUri, sizeof(char*) * 
++length);
                if (!tokenizedUri) fprintf(stderr, "ERROR: Couldn't realloc 
memory!\n");
                tokenizedUri[length - 1] = token;
                token = strtok(NULL, "/#");
        }
-       free(uriPtr);
 
        // match with ontologies
        for (j = 0; j < ontologyCount; ++j) {
@@ -142,7 +142,7 @@ void getPropNameShort(char** name, char*
                                for (i = ontologies[j].length; i < length; ++i) 
{
                                        totalLength += (strlen(tokenizedUri[i]) 
+ 1); // additional char for underscore
                                }
-                               (*name) = (char *) malloc(sizeof(char) * 
(totalLength + 1));
+                               (*name) = (char *) GDKmalloc(sizeof(char) * 
(totalLength + 1));
                                if (!(*name)) fprintf(stderr, "ERROR: Couldn't 
malloc memory!\n");
                                strcpy(*name, "\0");
 
@@ -153,7 +153,10 @@ void getPropNameShort(char** name, char*
                                // remove trailing underscore
                                (*name)[strlen(*name) - 1] = '\0';
 
-                               free(tokenizedUri);
+                               if ((*name)[strlen(*name) - 1] == '>') 
(*name)[strlen(*name) - 1] = '\0'; // remove >
+
+                               GDKfree(tokenizedUri);
+                               GDKfree(uriPtr);
                                return;
                        }
                }
@@ -163,16 +166,19 @@ void getPropNameShort(char** name, char*
 
        if (length <= 1) {
                // value
-               (*name) = (char *) malloc(sizeof(char) * (strlen(propStr) + 1));
+               (*name) = (char *) GDKmalloc(sizeof(char) * (strlen(propStr) + 
1));
                if (!(*name)) fprintf(stderr, "ERROR: Couldn't malloc 
memory!\n");
                strcpy(*name, propStr);
        } else {
-               (*name) = (char *) malloc(sizeof(char) * 
(strlen(tokenizedUri[length - 1]) + 1));
+               (*name) = (char *) GDKmalloc(sizeof(char) * 
(strlen(tokenizedUri[length - 1]) + 1));
                if (!(*name)) fprintf(stderr, "ERROR: Couldn't malloc 
memory!\n");
                strcpy(*name, tokenizedUri[length - 1]);
        }
 
-       free(tokenizedUri);
+       if ((*name)[strlen(*name) - 1] == '>') (*name)[strlen(*name) - 1] = 
'\0'; // remove >
+
+       GDKfree(tokenizedUri);
+       GDKfree(uriPtr);
        return;
 }
 #endif
@@ -238,11 +244,6 @@ Relation*** initRelationMetadata(int** r
        int             i, j, k;
        Relation***     relationMetadata;
 
-       int             ret;
-       char*           schema = "rdf";
-
-       TKNZRopen (NULL, &schema);
-
        relationMetadata = (Relation ***) malloc(sizeof(Relation **) * 
freqCSset->numCSadded);
        if (!relationMetadata) fprintf(stderr, "ERROR: Couldn't malloc 
memory!\n");
        for (i = 0; i < num; ++i) { // CS
@@ -293,8 +294,6 @@ Relation*** initRelationMetadata(int** r
                }
        }
 
-       TKNZRclose(&ret);
-
        return relationMetadata;
 }
 
@@ -304,11 +303,6 @@ Relation*** initRelationMetadata2(int** 
        int             i, j, k;
        Relation***     relationMetadata;
 
-       int             ret;
-       char*           schema = "rdf";
-
-       TKNZRopen (NULL, &schema);
-
        relationMetadata = (Relation ***) malloc(sizeof(Relation **) * 
freqCSset->numCSadded);
        if (!relationMetadata) fprintf(stderr, "ERROR: Couldn't malloc 
memory!\n");
        for (i = 0; i < freqCSset->numCSadded; ++i) { // CS
@@ -360,8 +354,6 @@ Relation*** initRelationMetadata2(int** 
                }
        }
 
-       TKNZRclose(&ret);
-
        return relationMetadata;
 }
 
@@ -439,7 +431,7 @@ void escapeURIforSQL(char* s) {
        int i;
 
        for (i = 0; i < (int) strlen(s); ++i) {
-               if (s[i] == ':' || s[i] == '"' || s[i] == ' ' || s[i] == '-') 
s[i] = '_';
+               if (s[i] == ':' || s[i] == '"' || s[i] == ' ' || s[i] == '-' || 
s[i] == '<' || s[i] == '>' || s[i] == '/' || s[i] == '(' || s[i] == ')' || s[i] 
== '.' || s[i] == '%') s[i] = '_';
                s[i] = tolower(s[i]);
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to