MonetDB: rdf - add support for not using labels in merging phase (not enabled)
Changeset: 480cd88defb8 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=480cd88defb8 Modified Files: monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: add support for not using labels in merging phase (not enabled) diffs (140 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -3585,6 +3585,7 @@ void generatecsRelSum(CSrel csRel, int f } +#if USE_LABEL_FOR_MERGING static LabelStat* initLabelStat(void){ LabelStat *labelStat = (LabelStat*) malloc(sizeof(LabelStat)); @@ -3603,10 +3604,12 @@ LabelStat* initLabelStat(void){ return labelStat; } +#endif /* * * */ +#if USE_LABEL_FOR_MERGING #if USE_ALTERNATIVE_NAME static oid getMostSuitableName(CSlabel *labels, int freqIdx, int candIdx){ @@ -3641,6 +3644,7 @@ oid getMostSuitableName(CSlabel *labels, } #endif +#endif #if DETECT_INCORRECT_TYPE_SUBJECT @@ -3801,6 +3805,7 @@ void buildLabelStatForFinalMergeCS(Label #endif +#if USE_LABEL_FOR_MERGING static void buildLabelStat(LabelStat *labelStat, CSlabel *labels, CSset *freqCSset, int k){ int i,j; @@ -3887,7 +3892,9 @@ void buildLabelStat(LabelStat *labelStat } } - +#endif + +#if USE_LABEL_FOR_MERGING static void freeLabelStat(LabelStat *labelStat){ int i; @@ -3901,6 +3908,7 @@ void freeLabelStat(LabelStat *labelStat) BBPreclaim(labelStat-labelBat); free(labelStat); } +#endif static void doMerge(CSset *freqCSset, int ruleNum, int freqId1, int freqId2, oid *mergecsId, CSlabel** labels, oid** ontmetadata, int ontmetadataCount, oid name, int isType, int isOntology, int isFK){ @@ -3954,6 +3962,7 @@ void doMerge(CSset *freqCSset, int ruleN } +#if USE_LABEL_FOR_MERGING static str mergeMaxFreqCSByS1(CSset *freqCSset, CSlabel** labels, oid *mergecsId, oid** ontmetadata, int ontmetadataCount,bat *mapbatid){ int i, j; @@ -4188,6 +4197,7 @@ str mergeMaxFreqCSByS1(CSset *freqCSset, return MAL_SUCCEED; } +#endif static void mergeMaxFreqCSByS5(CSrel 
*csrelMergeFreqSet, CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap, int curNumMergeCS, oid *mergecsId, oid** ontmetadata, int ontmetadataCount){ @@ -4319,7 +4329,7 @@ void mergeMaxFreqCSByS5(CSrel *csrelMerg } - +#if USE_LABEL_FOR_MERGING static char isSemanticSimilar(int freqId1, int freqId2, CSlabel* labels, OntoUsageNode *tree, int numOrigFreqCS, oid *ancestor, BAT *ontmetaBat, OntClass *ontclassSet){ /*Rule S1 S2 S3*/ int i, j; @@ -4433,6 +4443,7 @@ char isSemanticSimilar(int freqId1, int return 0; } +#endif static void initTFIDFInfos(TFIDFInfo *tfidfInfos, int curNumMergeCS, oid* mergeCSFreqCSMap, CSset *freqCSset, PropStat *propStat){ @@ -4476,6 +4487,7 @@ void freeTFIDFInfo(TFIDFInfo *tfidfInfos free(tfidfInfos); } +#if USE_LABEL_FOR_MERGING static void mergeCSByS2(CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap, int curNumMergeCS, oid *mergecsId,OntoUsageNode *ontoUsageTree, oid **ontmetadata, int ontmetadataCount, BAT *ontmetaBat, OntClass *ontclassSet){ int i, j; @@ -4517,6 +4529,7 @@ void mergeCSByS2(CSset *freqCSset, CSlab } } +#endif static void mergeCSByS4(CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap, int curNumMergeCS, oid *mergecsId,oid **ontmetadata, int ontmetadataCount){ @@ -9090,6 +9103,7 @@ RDFextractCSwithTypes(int *ret, bat *sba curNumMergeCS = countNumberMergeCS(freqCSset); printf(Before using rules: Number of freqCS is: %d \n,curNumMergeCS); +#if USE_LABEL_FOR_MERGING /* -- S1 --- */ mergecsId = *maxCSoid + 1; @@ -9109,6 +9123,7 @@ RDFextractCSwithTypes(int *ret, bat *sba computeMetricsQ(freqCSset); #endif tmpLastT = curT; +#endif /* -- S3 --- */ mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS); @@ -9159,6 +9174,7 @@ RDFextractCSwithTypes(int *ret, bat *sba tmpLastT = curT; +#if USE_LABEL_FOR_MERGING //S2: Common ancestor free(mergeCSFreqCSMap); mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS); @@ -9179,6 +9195,7 @@ RDFextractCSwithTypes(int *ret, bat *sba #endif tmpLastT = curT; 
+#endif //S4: TF/IDF similarity ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - implement USE_LABEL_FINDING_MAXCS (but do not enable it)
Changeset: ab84eb43b2d9 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=ab84eb43b2d9 Modified Files: monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: implement USE_LABEL_FINDING_MAXCS (but do not enable it) diffs (175 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -2970,12 +2970,68 @@ void updateParentIdxAll(CSset *freqCSset } } +#if USE_LABEL_FINDING_MAXCS +/* + * * Return 1 if there is semantic evidence against merging the two CS's, this is the case iff the two CS's have a hierarchy and their common ancestor is too generic (support above IMPORTANCE_THRESHOLD). + * */ +static +char isEvidenceAgainstMerging(int freqId1, int freqId2, CSlabel* labels, OntoUsageNode *tree) { + int i, j; + int level; + OntoUsageNode *tmpNode; + + // Get common ancestor + int hCount1 = labels[freqId1].hierarchyCount; + int hCount2 = labels[freqId2].hierarchyCount; + int minCount = (hCount1 hCount2)?hCount2:hCount1; + + if (minCount == 0) { + // at least one CS does not have a hierarchy -- no semantic information -- no semantic evidence against merging + return 0; + } + + // get level where the hierarchies differ + for (i = 0; i minCount; i++){ + if (labels[freqId1].hierarchy[hCount1-1-i] != labels[freqId2].hierarchy[hCount2-1-i]) break; + } + + if (i == 0) { + // not even the top level of the hierarchy is the same -- there is semantic evidence against merging the two CS's + return 1; + } else if (i == minCount) { + // same name -- no semantic evidence against merging + return 0; + } + + // get the common ancestor at level i + level = 0; + tmpNode = tree; + while(level i){ + for (j = 0; j tmpNode-numChildren; j++) { + if (tmpNode-lstChildren[j]-uri == labels[freqId1].hierarchy[hCount1-1-level]){ + tmpNode = tmpNode-lstChildren[j]; + break; + } + } + level++; + } + + if (tmpNode-percentage = IMPORTANCE_THRESHOLD) { + // have common ancestor 
but it is too generic -- there is semantic evidence against merging the two CS's + return 1; + } else { + // common ancestor is specific -- no semantic evidence against merging + return 0; + } +} +#endif + /* * Get the maximum frequent CSs from a CSset * Here maximum frequent CS is a CS that there exist no other CS which contains that CS * */ static -void mergeCSbyS3(CSset *freqCSset, CSlabel** labels, oid *mergeCSFreqCSMap, int curNumMergeCS, oid **ontmetadata, int ontmetadataCount){ +void mergeCSbyS3(CSset *freqCSset, CSlabel** labels, oid *mergeCSFreqCSMap, int curNumMergeCS, oid **ontmetadata, int ontmetadataCount, OntoUsageNode *tree){ int numMergeCS = curNumMergeCS; int i, j; @@ -2983,13 +3039,12 @@ void mergeCSbyS3(CSset *freqCSset, CSlab int tmpParentIdx; int freqId1, freqId2; - #if USE_LABEL_FINDING_MAXCS - charisLabelComparable = 0; - #endif - charisDiffLabel = 0; int numP1, numP2; CS *mergecs1, *mergecs2; - (void) labels; + +#if !USE_LABEL_FINDING_MAXCS + (void) tree; +#endif printf(Retrieving maximum frequent CSs: \n); @@ -3000,44 +3055,35 @@ void mergeCSbyS3(CSset *freqCSset, CSlab if (freqCSset-items[freqId1].type == DIMENSIONCS) continue; #endif - #if USE_LABEL_FINDING_MAXCS - isLabelComparable = 0; - if ((*labels)[i].name != BUN_NONE) isLabelComparable = 1; // no DUMMY - #endif - for (j = (i+1); j numMergeCS; j++){ freqId2 = mergeCSFreqCSMap[j]; #if NOT_MERGE_DIMENSIONCS if (freqCSset-items[freqId2].type == DIMENSIONCS) continue; #endif - isDiffLabel = 0; - #if USE_LABEL_FINDING_MAXCS - if (isLabelComparable == 0 || strcmp((*labels)[freqId1].name, (*labels)[freqId2].name) != 0) { - isDiffLabel = 1; - } - #endif - - if (isDiffLabel == 0){ - numP2 = freqCSset-items[freqId2].numProp; - numP1 = freqCSset-items[freqId1].numProp; - if (numP2 numP1 (numP2-numP1) MAX_SUB_SUPER_NUMPROP_DIF){ - if
MonetDB: rdf - add support for printing ontology tree
Changeset: 121f9fd5d239 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=121f9fd5d239 Modified Files: monetdb5/extras/rdf/rdflabels.c Branch: rdf Log Message: add support for printing ontology tree diffs (65 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -2621,18 +2621,45 @@ void addToOntoUsageTree(OntoUsageNode* t } static -void printTree(OntoUsageNode* tree, int level) { +void printTreePrivate(OntoUsageNode* tree, int level, FILE* fout) { int i; - str uriStr; - - takeOid(tree-uri, uriStr); - printf(Level %d URI %s Count %d Sum %d Percent %.1f\n, level, uriStr, tree-numOccurances, tree-numOccurancesSum, tree-percentage * 100); - + str uriStr, uriStrShort; + + if (tree-parent) { + takeOid(tree-uri, uriStr); + getPropNameShort(uriStrShort, uriStr); + fprintf(fout, BUNFMT [label = \%s (%.1f%%)\];\n, tree-uri, uriStrShort, tree-percentage * 100); + fprintf(fout, BUNFMT--BUNFMT;\n, tree-uri, tree-parent-uri); + GDKfree(uriStrShort); + GDKfree(uriStr); + } else { + // artifical root, has no name + fprintf(fout, BUNFMT [label = \ROOT (%.1f%%)\];\n, tree-uri, tree-percentage * 100); + } for (i = 0; i tree-numChildren; ++i) { - printTree(tree-lstChildren[i], level+1); + printTreePrivate(tree-lstChildren[i], level+1, fout); } } +/* + * Print ontology tree to file, dot code + */ +static +void printTree(OntoUsageNode* tree) { + FILE *fout = fopen(ontoUsageTree.txt, wt); + + // header + fprintf(fout, graph g {\n); + fprintf(fout, graph [ratio = \compress\, rankdir = \RL\];\n); + fprintf(fout, node [shape = \box\];\n\n); + // body + printTreePrivate(tree, 0, fout); + // footer + + fprintf(fout, }\n); + fclose(fout); +} + static void createOntoUsageTree(OntoUsageNode** tree, CSset* freqCSset, oid** ontmetadata, int ontmetadataCount, BAT *ontmetaBat,CSlabel* labels) { int i; @@ -2679,7 +2706,7 @@ void createOntoUsageTree(OntoUsageNode** // 
print if(0){ printf(Ontology tree:\n); - printTree(*tree, 0); + printTree(*tree); } } ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - update layout of sampleDataFullRandom file
Changeset: 5f95f68cf48b for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=5f95f68cf48b Modified Files: monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: update layout of sampleDataFullRandom file diffs (54 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -6848,12 +6848,13 @@ str printFullSampleData(CSSampleExtend * fouttb = fopen(filename2,wt); foutis = fopen(filename3,wt); + fprintf(foutrand, Table|Name|Rating\n); for (i = 0; i num; i++){ sample = csSampleEx[i]; if ((int)sample.candidateCount == 1 sample.candidates[0] == BUN_NONE) continue; // do not print tables withoud candidates freqCS = freqCSset-items[sample.freqIdx]; fprintf(fout,Table %d, %d tuples\n, i, freqCS.support); - fprintf(foutrand,Table %d, %d tuples\n, i, freqCS.support); + fprintf(foutrand,Table %d, %d tuples, i, freqCS.support); fprintf(foutsol, Table %d\n, i); for (j = 0; j (int)sample.candidateCount; j++){ //fprintf(fout,BUNFMT,sample.candidates[j]); @@ -6865,12 +6866,10 @@ str printFullSampleData(CSSampleExtend * getStringName(sample.candidates[j], canStr, mapi, mbat, 1); #if USE_SHORT_NAMES getPropNameShort(canStrShort, canStr); - if (j+1 == (int)sample.candidateCount) fprintf(foutrand, %s, canStrShort); - else fprintf(foutrand, %s|, canStrShort); + fprintf(foutrand, |%s\n, canStrShort); GDKfree(canStrShort); #else - if (j+1 == (int)sample.candidateCount) fprintf(foutrand, %s, canStr); - else fprintf(foutrand, %s|, canStr); + fprintf(foutrand, %s, canStr); #endif GDKfree(canStr); @@ -6896,11 +6895,10 @@ str printFullSampleData(CSSampleExtend * } } - fprintf(foutrand, \n); fprintf(foutsol, \n); // print origin of candidates for solutions file - fprintf(foutsol, New: %d, Type %d, Ontology %d, FK %d\n, sample.candidatesNew, sample.candidatesType, sample.candidatesOntology, sample.candidatesFK); + fprintf(foutsol, New %d, Type %d, Ontology %d, FK %d\n, 
sample.candidatesNew, sample.candidatesType, sample.candidatesOntology, sample.candidatesFK); if (sample.name != BUN_NONE){ str canStrShort = NULL; @@ -7136,7 +7134,6 @@ str printFullSampleData(CSSampleExtend * fprintf(fout, \n); fprintf(foutsol, \n); - fprintf(foutrand, \n); fprintf(foutis, \ tmp.txt \n \n); if (sample.name != BUN_NONE){ ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - fix bug in printing referenced table names
Changeset: cfb55d248d82 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=cfb55d248d82 Modified Files: monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: fix bug in printing referenced table names instead of the name of the referenced table, the name of the current table was printed diffs (26 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -6582,12 +6582,11 @@ str printSampleData(CSSample *csSample, #if NO_OUTPUTFILE == 0 static void printPropertyWithMarkers(FILE *fout, str propStr, CSSampleExtend *csSampleEx, CSPropTypes *csPropTypes, int tblId, int propId, BATiter mapi, BAT *mbat) { - CSSampleExtend sample = csSampleEx[tblId]; // print property string fprintf(fout, %s, propStr); // add star (*) if multi-valued - if (sample.lstIsMVCol[propId]) { + if (csSampleEx[tblId].lstIsMVCol[propId]) { fprintf(fout, *); } @@ -6599,7 +6598,7 @@ void printPropertyWithMarkers(FILE *fout #if USE_SHORT_NAMES str nameStrShort; #endif - getStringName(sample.candidatesOrdered[0], nameStr, mapi, mbat, 1); + getStringName(csSampleEx[refTblId].candidatesOrdered[0], nameStr, mapi, mbat, 1); #if USE_SHORT_NAMES getPropNameShort(nameStrShort, nameStr); fprintf(fout, -%s, nameStrShort); ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - remove language tags and quotation marks from multi-valued properties of type datetime and string
Changeset: 22a192111e74 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=22a192111e74 Modified Files: monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: remove language tags and quotation marks from multi-valued properties of type datetime and string diffs (34 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -5514,9 +5514,12 @@ int getObjValueFromMVBat(ValPtr returnVa //printf(A String object value: %s \n,objStr); tmpStr = BUNtail(tmpi, pos); if (strcmp(tmpStr,str_nil) != 0){ - inputStr = GDKmalloc(sizeof(char) * strlen(tmpStr) + 1); - memcpy(inputStr, tmpStr, sizeof(char) * strlen(tmpStr) + 1); - + // remove quotes and language tags + str tmpStrShort; + getStringBetweenQuotes(tmpStrShort, tmpStr); + inputStr = GDKmalloc(sizeof(char) * strlen(tmpStrShort) + 1); + memcpy(inputStr, tmpStrShort, sizeof(char) * strlen(tmpStrShort) + 1); + GDKfree(tmpStrShort); VALset(returnValue, TYPE_str, inputStr); if (rdfcast(objType, STRING, returnValue, castedValue) != 1){ printf(Everything should be able to cast to String \n); @@ -5532,8 +5535,12 @@ int getObjValueFromMVBat(ValPtr returnVa //printf(A Datetime object value: %s \n,objStr); tmpStr = BUNtail(tmpi, pos); if (strcmp(tmpStr,str_nil) != 0){ - inputStr = GDKmalloc(sizeof(char) * strlen(tmpStr) + 1); - memcpy(inputStr, tmpStr, sizeof(char) * strlen(tmpStr) + 1); + // remove quotes and language tags + str tmpStrShort; + getStringBetweenQuotes(tmpStrShort, tmpStr); + inputStr = GDKmalloc(sizeof(char) * strlen(tmpStrShort) + 1); + memcpy(inputStr, tmpStrShort, sizeof(char) * strlen(tmpStrShort) + 1); + GDKfree(tmpStrShort); VALset(returnValue, TYPE_str, inputStr); if (rdfcast(objType, STRING, returnValue, castedValue) != 1){ printf(Everything should be able to cast to String \n); ___ checkin-list mailing list checkin-list@monetdb.org 
https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - if type properties are available, add them to the survey data in any case
Changeset: f638ef061c44 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f638ef061c44 Modified Files: monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: if type properties are available, add them to the survey data in any case diffs (44 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -6381,7 +6381,7 @@ void printPropertyWithMarkers(FILE *fout #if NO_OUTPUTFILE == 0 // Compute property order and number of properties that are printed, and the list of remaining properties that is printed without sample data static -int* createPropertyOrder(int *numPropsInSampleTable, int **remainingProperties, CSset *freqCSset, CSSampleExtend *csSampleEx, int tblId, CSPropTypes *csPropTypes, PropStat *propStat) { +int* createPropertyOrder(int *numPropsInSampleTable, int **remainingProperties, CSset *freqCSset, CSSampleExtend *csSampleEx, int tblId, CSPropTypes *csPropTypes, PropStat *propStat, char* isTypeProp) { int i; CSSampleExtend sample; CSPropTypes csPropType; @@ -6466,11 +6466,21 @@ int* createPropertyOrder(int *numPropsIn } // now add properties to propOrder array + // add all type properties + for (i = 0; i sample.numProp; ++i) { + if (propsAdded = (*numPropsInSampleTable)) break; // enough properties found + if (isTypeProp[i]) { // do not use 'index' because the isTypeProp array uses the old order of properties + propOrder[propsAdded] = i; + isAdded[i] = 1; + propsAdded++; + } + } + // first round: properties with isFilled=1 and isTextDate=1, ordered by tfidfValues descending for (i = 0; i sample.numProp; ++i) { int index = propOrderTfidf[i]; if (propsAdded = (*numPropsInSampleTable)) break; // enough properties found - if (isFilled[index] isTextDate[index]) { + if (isFilled[index] isTextDate[index] !isAdded[index]) { // add propOrder[propsAdded] = index; isAdded[index] = 1; @@ -6702,7 +6712,7 @@ str 
printFullSampleData(CSSampleExtend * // order properties and get list of remaining properties that will be printed without sample data remainingProperties = NULL; numPropsInSampleTable = 0; - propOrder = createPropertyOrder(numPropsInSampleTable, remainingProperties, freqCSset, csSampleEx, i, csPropTypes, propStat); + propOrder = createPropertyOrder(numPropsInSampleTable, remainingProperties, freqCSset, csSampleEx, i, csPropTypes, propStat, isTypeProp); // print list of columns that did not make it to propOrder and are therefore printed without sample data if (sample.numProp numPropsInSampleTable) { ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - ensure same height of tables, necessary for layouting
Changeset: acc2cfbc3ba4 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=acc2cfbc3ba4 Modified Files: monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: ensure same height of tables, necessary for layouting diffs (13 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -6736,6 +6736,9 @@ str printFullSampleData(CSSampleExtend * GDKfree(propStr); } fprintf(fout, \n); + } else { + // we have to print an empty row to ensure that all tables have the same height, this simplifies the survey layouting in a spreadsheet programm + fprintf(fout, \n); } //List of columns ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - fix scope
Changeset: ad60eed4d99a for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=ad60eed4d99a Modified Files: monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: fix scope diffs (19 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -9192,6 +9192,7 @@ RDFreorganize(int *ret, CStableStat *cst printf(Return value from RDFdistTriplesToCSs is %s \n, returnStr); if (returnStr != MAL_SUCCEED){ throw(RDF, rdf.RDFreorganize, Problem in distributing triples to BATs using CSs); + } curT = clock(); printf (RDFdistTriplesToCSs process took %f seconds.\n, ((float)(curT - tmpLastT))/CLOCKS_PER_SEC); @@ -9200,7 +9201,6 @@ RDFreorganize(int *ret, CStableStat *cst #if NO_OUTPUTFILE == 0 printFKMultiplicityFromCSPropTypes(csPropTypes, numTables, freqCSset, *freqThreshold); #endif - } #if NO_OUTPUTFILE == 0 { ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - set name origin (isType, isOntology, isFK) when updating labels
Changeset: d201cd7814d2 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=d201cd7814d2 Modified Files: monetdb5/extras/rdf/rdflabels.c monetdb5/extras/rdf/rdflabels.h monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: set name origin (isType, isOntology, isFK) when updating labels diffs (285 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -2928,7 +2928,7 @@ oid* mergeCandidates(int *candidatesCoun * If no MERGECS is created (subset-superset relation), mergeCSFreqId contains the Id of the superset class. * For S1 and S2, parameter 'name' is used to avoid recomputation of CS names */ -str updateLabel(int ruleNumber, CSset *freqCSset, CSlabel **labels, int newCS, int mergeCSFreqId, int freqCS1, int freqCS2, oid name, oid **ontmetadata, int ontmetadataCount, int *lstFreqId, int numIds){ +str updateLabel(int ruleNumber, CSset *freqCSset, CSlabel **labels, int newCS, int mergeCSFreqId, int freqCS1, int freqCS2, oid name, int isType, int isOntology, int isFK, oid **ontmetadata, int ontmetadataCount, int *lstFreqId, int numIds){ int i; int freqCS1Counter; CSlabel big, small; @@ -2945,6 +2945,12 @@ str updateLabel(int ruleNumber, CSset *f (void) lstFreqId; (void) numIds; + #if ! 
INFO_WHERE_NAME_FROM + (void) isType; + (void) isOntology; + (void) isFK; + #endif + if (newCS) { // realloc labels *labels = GDKrealloc(*labels, sizeof(CSlabel) * freqCSset-numCSadded); @@ -2979,6 +2985,11 @@ str updateLabel(int ruleNumber, CSset *f case S1: // was: (S1 or S2), now combined // use common name label-name = name; + #if INFO_WHERE_NAME_FROM + label-isType = isType; + label-isOntology = isOntology; + label-isFK = isFK; + #endif #if USE_MULTIWAY_MERGING (void)ontmetadata; @@ -2996,9 +3007,6 @@ str updateLabel(int ruleNumber, CSset *f label-candidatesOntology = candidatesOntology; label-candidatesFK = candidatesFK; removeDuplicatedCandidates(label); - if (label-name == BUN_NONE label-candidates[0] != BUN_NONE) { - label-name = label-candidates[0]; - } // hierarchy if ((*labels)[freqCS1].name == label-name) { @@ -3033,6 +3041,11 @@ str updateLabel(int ruleNumber, CSset *f case S2: // use common ancestor label-name = name; + #if INFO_WHERE_NAME_FROM + label-isType = isType; + label-isOntology = isOntology; + label-isFK = isFK; + #endif // candidates mergedCandidates = mergeCandidates(candidatesCount, candidatesNew, candidatesType, candidatesOntology, candidatesFK, (*labels)[freqCS1], (*labels)[freqCS2], label-name); @@ -3044,9 +3057,6 @@ str updateLabel(int ruleNumber, CSset *f label-candidatesOntology = candidatesOntology; label-candidatesFK = candidatesFK; removeDuplicatedCandidates(label); - if (label-name == BUN_NONE label-candidates[0] != BUN_NONE) { - label-name = label-candidates[0]; - } // hierarchy freqCS1Counter = (*labels)[freqCS1].hierarchyCount - 1; @@ -3080,8 +3090,14 @@ str updateLabel(int ruleNumber, CSset *f label-candidatesFK = candidatesFK; removeDuplicatedCandidates(label); if (label-name == BUN_NONE label-candidates[0] != BUN_NONE) { + // superCS had no name before, but subCS adds candidates label-name = label-candidates[0]; - } + #if INFO_WHERE_NAME_FROM + label-isType = (*labels)[freqCS2].isType; + label-isOntology = 
(*labels)[freqCS2].isOntology; + label-isFK = (*labels)[freqCS2].isFK; + #endif + } // else: old name and isType/isOntology/isFK remain valid // hierarchy already set // properties already set @@ -3113,6 +3129,11 @@ str updateLabel(int ruleNumber, CSset *f } // #endif label-name = big.name; + #if INFO_WHERE_NAME_FROM + label-isType = big.isType; + label-isOntology = big.isOntology; + label-isFK = big.isFK; + #endif // candidates mergedCandidates =
MonetDB: rdf - improve layout and data presentation of survey data
Changeset: 2f740b0aabd2 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=2f740b0aabd2 Modified Files: monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: improve layout and data presentation of survey data - remove quotes and language tags from strings - indicate multi-valued properties with a star * - indicate FK properties with a reference -ReferencedTableName - use only last part of URI for type property values diffs (165 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -6190,8 +6190,43 @@ str printSampleData(CSSample *csSample, #endif #if NO_OUTPUTFILE == 0 +static +void printPropertyWithMarkers(FILE *fout, str propStr, CSSampleExtend *csSampleEx, CSPropTypes *csPropTypes, int tblId, int propId, BATiter mapi, BAT *mbat) { + // print property string + fprintf(fout, %s, propStr); + + // add star (*) if multi-valued + if (csSampleEx[tblId].lstIsMVCol[propId]) { + fprintf(fout, *); + } + + // add reference (-) if FK + if (csPropTypes[tblId].lstPropTypes[propId].isFKProp == 1) { + str nameStr; + int refTblId = csPropTypes[tblId].lstPropTypes[propId].refTblId; + if (csSampleEx[refTblId].candidatesOrdered[0] != BUN_NONE) { // table name (= best candidate) available +#if USE_SHORT_NAMES + str nameStrShort; +#endif + getStringName(csSampleEx[tblId].candidatesOrdered[0], nameStr, mapi, mbat, 1); +#if USE_SHORT_NAMES + getPropNameShort(nameStrShort, nameStr); + fprintf(fout, -%s, nameStrShort); + GDKfree(nameStrShort); +#else + fprintf(fout, -%s, nameStr); +#endif + GDKfree(nameStr); + } else { // no table name + fprintf(fout, -Table%d, refTblId); + } + } +} +#endif + +#if NO_OUTPUTFILE == 0 static -str printFullSampleData(CSSampleExtend *csSampleEx, int num, BAT *mbat, PropStat *propStat, CSset *freqCSset){ +str printFullSampleData(CSSampleExtend *csSampleEx, int num, BAT *mbat, PropStat *propStat, CSset *freqCSset, CSPropTypes 
*csPropTypes){ int i,j, k; FILE*fout, *foutrand, *foutsol, *fouttb, *foutis; @@ -6230,13 +6265,24 @@ str printFullSampleData(CSSampleExtend * int found = 0; CS freqCS; - - mapi = bat_iterator(mbat); + oid *typeAttributesOids; + char*isTypeProp; // 1 if property is in typeAttributes[] + if (TKNZRopen (NULL, schema) != MAL_SUCCEED) { throw(RDF, rdf.rdfschema, could not open the tokenizer\n); } + // get oids for typeAttributes[] + typeAttributesOids = GDKmalloc(sizeof(oid) * typeAttributesCount); + if (!typeAttributesOids){ + fprintf(stderr, ERROR: Couldn't malloc memory!\n); + } + for (i = 0; i typeAttributesCount; ++i) { + TKNZRappend(typeAttributesOids[i], typeAttributes[i]); + } + + mapi = bat_iterator(mbat); strcpy(filename, sampleDataFull); strcat(filename, .txt); @@ -6336,6 +6382,24 @@ str printFullSampleData(CSSampleExtend * else fprintf(fouttb,CREATE TABLE tbSample%d \n (\n, i); + // mark type columns, because their sample data is represented without ... + isTypeProp = GDKmalloc(sizeof(char) * sample.numProp); + if (!isTypeProp){ + fprintf(stderr, ERROR: Couldn't malloc memory!\n); + } + for (j = 0; j sample.numProp; ++j) { + isTypeProp[j] = 0; + } + for (j = 0; j sample.numProp; ++j) { + for (k = 0; k typeAttributesCount; ++k) { + if (sample.lstProp[j] == typeAttributesOids[k]) { + // found a type property + isTypeProp[j] = 1; + break; + } + } + } + // Compute property order (descending by support) and number of properties that are printed found = 0; numPropsInSampleTable = (sample.numProp(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE))?(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE):sample.numProp; @@ -6424,7 +6488,8 @@ str printFullSampleData(CSSampleExtend * takeOid(sample.lstProp[index], propStr); #if USE_SHORT_NAMES getPropNameShort(propStrShort, propStr); - fprintf(fout,|%s, propStrShort); +
MonetDB: rdf - change selection of properties shown to the user
Changeset: 38f4907254da for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=38f4907254da Modified Files: monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message: change selection of properties shown to the user - choose discriminating, filled, text/datetime properties first; then discriminating, filled properties; then discriminating properties - add all omitted properties to the output file without showing sample data - remove reordering of props from printSampleData() because it is not used for generating survey data (instead, printFullSampleData() is used) diffs (truncated from 527 to 300 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -5815,7 +5815,7 @@ void getTblName(str *name, oid nameId, B #if NO_OUTPUTFILE == 0 static -str printSampleData(CSSample *csSample, CSset *freqCSset, BAT *mbat, int num, int sampleVersion, PropStat *propStat){ +str printSampleData(CSSample *csSample, CSset *freqCSset, BAT *mbat, int num, int sampleVersion){ int i,j, k; FILE*fout, *fouttb, *foutis; @@ -5828,9 +5828,6 @@ str printSampleData(CSSample *csSample, char* schema = rdf; CSSamplesample; CS freqCS; - int*propOrder; - int*propOrderTfidf; - float* tfidfValues; int numPropsInSampleTable; charobjType = 0; str objStr; @@ -5935,82 +5932,6 @@ str printSampleData(CSSample *csSample, //Number of tuples fprintf(fout, %d\n, freqCS.support); - // Compute property order (descending by support) and number of properties that are printed - if (sampleVersion 1) { - int found = 0; - numPropsInSampleTable = (sample.numProp(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE))?(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE):sample.numProp; - propOrder = GDKmalloc(sizeof(int) * sample.numProp); - propOrderTfidf = GDKmalloc(sizeof(int) * sample.numProp); - tfidfValues = GDKmalloc(sizeof(float) * sample.numProp); - for (j = 0; j 
sample.numProp; ++j) { - propOrder[j] = j; - propOrderTfidf[j] = j; - } - - // To get the top NUM_PROP_SUPPORT_SAMPLE properties, sort all properties descending by support. - // The subject column remains at the first position regardless of its support. - // Sort using insertion sort. - for (j = 2; j sample.numProp; ++j) { - int tmpPos = propOrder[j]; - int tmpVal = freqCS.lstPropSupport[tmpPos]; - int k = j - 1; - while (k = 1 freqCS.lstPropSupport[propOrder[k]] tmpVal) { // sort descending - propOrder[k + 1] = propOrder[k]; - k--; - } - propOrder[k + 1] = tmpPos; - } - - // To get the top NUM_PROP_TFIDF_SAMPLE properties, sort all properties descending by tf-idf score. - for (j = 1; j sample.numProp; ++j) { - float tfidf; - BUN bun = BUNfnd(BATmirror(propStat-pBat),(ptr) sample.lstProp[j]); - if (bun == BUN_NONE) { - printf(Error: property not found\n); - } else { - tfidf = propStat-tfidfs[bun]; - } - tfidfValues[j] = tfidf; - } - - // Sort using insertion sort. Ignore subject column - for (j = 2; j sample.numProp; ++j) { - int tmpPos = propOrderTfidf[j]; - float tmpVal = tfidfValues[tmpPos]; - int k = j - 1; - while (k = 1 tfidfValues[propOrderTfidf[k]] tmpVal) { // sort descending - propOrderTfidf[k + 1] = propOrderTfidf[k]; - k--; - } - propOrderTfidf[k + 1] = tmpPos; - } - - // Add NUM_PROP_TFIDF_SAMPLE properties to propOrder that have a high tfidf score but are not yet in the top 1+NUM_PROP_TFIDF_SAMPLE values of propOrder - for (j = 1; j
MonetDB: rdf - change FullSampleData to print only 8 columns and...
Changeset: 782ccaa7dff9 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=782ccaa7dff9 Modified Files: monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message: change FullSampleData to print only 8 columns and add a file that contains the solutions (ordered candidates) diffs (truncated from 369 to 300 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -5507,9 +5507,15 @@ str initFullSampleData(CSSampleExtend *c csSampleEx[i].name = cstablestat-lstcstable[i].tblname; csSampleEx[i].candidateCount = tmpNumcand; csSampleEx[i].candidates = (oid*)malloc(sizeof(oid) * tmpNumcand); + csSampleEx[i].candidatesOrdered = (oid*)malloc(sizeof(oid) * tmpNumcand); for (k = 0; k tmpNumcand; k++){ csSampleEx[i].candidates[k] = label[freqId].candidates[k]; - } + csSampleEx[i].candidatesOrdered[k] = label[freqId].candidates[k]; + } + csSampleEx[i].candidatesNew = label[freqId].candidatesNew; + csSampleEx[i].candidatesOntology = label[freqId].candidatesOntology; + csSampleEx[i].candidatesType = label[freqId].candidatesType; + csSampleEx[i].candidatesFK = label[freqId].candidatesFK; //Randomly exchange the value, change the position k with a random pos for (k = 0; k tmpNumcand; k++){ randValue = rand() % tmpNumcand; @@ -5650,6 +5656,7 @@ void freeSampleExData(CSSampleExtend *cs free(csSampleEx[i].lstIsInfrequentProp); free(csSampleEx[i].lstIsMVCol); free(csSampleEx[i].candidates); + free(csSampleEx[i].candidatesOrdered); free(csSampleEx[i].lstSubjOid); for (j = 0; j csSampleEx[i].numProp; j++){ BBPunfix(csSampleEx[i].colBats[j]-batCacheid); @@ -6184,11 +6191,11 @@ str printSampleData(CSSample *csSample, #if NO_OUTPUTFILE == 0 static -str printFullSampleData(CSSampleExtend *csSampleEx, int num, BAT *mbat){ +str printFullSampleData(CSSampleExtend *csSampleEx, int num, BAT *mbat, PropStat *propStat, CSset *freqCSset){ int 
i,j, k; - FILE*fout, *fouttb, *foutis; - charfilename[100], filename2[100], filename3[100]; + FILE*fout, *foutsol, *fouttb, *foutis; + charfilename[100], filename4[100], filename2[100], filename3[100]; int ret; str propStr; @@ -6216,6 +6223,12 @@ str printFullSampleData(CSSampleExtend * str propStrShort = NULL; char*pch; #endif + int*propOrder; + int*propOrderTfidf; + float* tfidfValues; + int numPropsInSampleTable; + int found = 0; + CS freqCS; mapi = bat_iterator(mbat); @@ -6227,6 +6240,9 @@ str printFullSampleData(CSSampleExtend * strcpy(filename, sampleDataFull); strcat(filename, .txt); + + strcpy(filename4, sampleDataFullSolution); + strcat(filename4, .txt); strcpy(filename2, createSampleTableFull); strcat(filename2, .sh); @@ -6235,12 +6251,15 @@ str printFullSampleData(CSSampleExtend * strcat(filename3, .sh); fout = fopen(filename,wt); + foutsol = fopen(filename4,wt); fouttb = fopen(filename2,wt); foutis = fopen(filename3,wt); for (i = 0; i num; i++){ sample = csSampleEx[i]; - fprintf(fout,Sample table %d Candidates: , i); + freqCS = freqCSset-items[sample.freqIdx]; + fprintf(fout,Table %d\n, i); + fprintf(foutsol, Table %d\n, i); for (j = 0; j (int)sample.candidateCount; j++){ //fprintf(fout,BUNFMT,sample.candidates[j]); if (sample.candidates[j] != BUN_NONE){ @@ -6251,18 +6270,43 @@ str printFullSampleData(CSSampleExtend * getStringName(sample.candidates[j], canStr, mapi, mbat, 1); #if USE_SHORT_NAMES getPropNameShort(canStrShort, canStr); - fprintf(fout,;%s, canStrShort); + if (j+1 == (int)sample.candidateCount) fprintf(fout, %s, canStrShort); + else fprintf(fout, %s;, canStrShort); GDKfree(canStrShort); #else - fprintf(fout,;%s, canStr); + if (j+1 == (int)sample.candidateCount) fprintf(fout, %s, canStr); + else fprintf(fout, %s;, canStr); + #endif GDKfree(canStr);
MonetDB: rdf - change delimiter in sample data, split into two f...
Changeset: df9f9c031311 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=df9f9c031311 Modified Files: monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: change delimiter in sample data, split into two files (instances + candidates) diffs (162 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -6196,7 +6196,7 @@ static str printFullSampleData(CSSampleExtend *csSampleEx, int num, BAT *mbat, PropStat *propStat, CSset *freqCSset){ int i,j, k; - FILE*fout, *foutsol, *fouttb, *foutis; + FILE*fout, *foutrand, *foutsol, *fouttb, *foutis; charfilename[100], filename4[100], filename2[100], filename3[100]; int ret; @@ -6254,13 +6254,15 @@ str printFullSampleData(CSSampleExtend * fout = fopen(filename,wt); foutsol = fopen(filename4,wt); + foutrand = fopen(sampleDataFullRandom.txt,wt); fouttb = fopen(filename2,wt); foutis = fopen(filename3,wt); for (i = 0; i num; i++){ sample = csSampleEx[i]; freqCS = freqCSset-items[sample.freqIdx]; - fprintf(fout,Table %d\n, i); + fprintf(fout,Table %d, %d tuples\n, i, freqCS.support); + fprintf(foutrand,Table %d, %d tuples\n, i, freqCS.support); fprintf(foutsol, Table %d\n, i); for (j = 0; j (int)sample.candidateCount; j++){ //fprintf(fout,BUNFMT,sample.candidates[j]); @@ -6272,12 +6274,12 @@ str printFullSampleData(CSSampleExtend * getStringName(sample.candidates[j], canStr, mapi, mbat, 1); #if USE_SHORT_NAMES getPropNameShort(canStrShort, canStr); - if (j+1 == (int)sample.candidateCount) fprintf(fout, %s, canStrShort); - else fprintf(fout, %s;, canStrShort); + if (j+1 == (int)sample.candidateCount) fprintf(foutrand, %s, canStrShort); + else fprintf(foutrand, %s|, canStrShort); GDKfree(canStrShort); #else - if (j+1 == (int)sample.candidateCount) fprintf(fout, %s, canStr); - else fprintf(fout, %s;, canStr); + if (j+1 == (int)sample.candidateCount) fprintf(foutrand, %s, canStr); + else fprintf(foutrand, 
%s|, canStr); #endif GDKfree(canStr); @@ -6292,18 +6294,18 @@ str printFullSampleData(CSSampleExtend * #if USE_SHORT_NAMES getPropNameShort(canStrShort, canStr); if (j+1 == (int)sample.candidateCount) fprintf(foutsol, %s (%s), canStrShort, canStr); - else fprintf(foutsol, %s (%s);, canStrShort, canStr); + else fprintf(foutsol, %s (%s)|, canStrShort, canStr); GDKfree(canStrShort); #else if (j+1 == (int)sample.candidateCount) fprintf(foutsol, %s, canStr); - else fprintf(foutsol, %s;, canStr); + else fprintf(foutsol, %s|, canStr); #endif GDKfree(canStr); } } - fprintf(fout, \n); + fprintf(foutrand, \n); fprintf(foutsol, \n); // print origin of candidates for solutions file @@ -6335,9 +6337,6 @@ str printFullSampleData(CSSampleExtend * else fprintf(fouttb,CREATE TABLE tbSample%d \n (\n, i); - //Number of tuples - fprintf(fout, %d\n, freqCS.support); - // Compute property order (descending by support) and number of properties that are printed found = 0; numPropsInSampleTable = (sample.numProp(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE))?(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE):sample.numProp; @@ -6426,7 +6425,7 @@ str printFullSampleData(CSSampleExtend * takeOid(sample.lstProp[index], propStr); #if USE_SHORT_NAMES getPropNameShort(propStrShort, propStr); - fprintf(fout,;%s, propStrShort); + fprintf(fout,|%s, propStrShort); pch = strstr (propStrShort,-); if (pch != NULL) *pch = '\0'; //Remove - characters from prop //WEBCRAWL specific problem @@ -6507,14 +6506,14 @@ str printFullSampleData(CSSampleExtend * } else{
MonetDB: rdf - fix delimiter in sample data, fix number of candi...
Changeset: f69ebb7d9f55 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f69ebb7d9f55 Modified Files: monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message: fix delimiter in sample data, fix number of candidates that are printed diffs (24 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -6501,7 +6501,7 @@ str printFullSampleData(CSSampleExtend * if (tmpBat->ttype == TYPE_oid){ //URI or BLANK NODE or MVCol objOid = (oid *) BUNtail(tmpi, k); if (*objOid == oid_nil){ - fprintf(fout,";NULL"); + fprintf(fout,"|NULL"); fprintf(foutis,"|NULL"); } else{ diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h --- a/monetdb5/extras/rdf/rdfschema.h +++ b/monetdb5/extras/rdf/rdfschema.h @@ -403,7 +403,7 @@ typedef struct CSPropTypes { #define NUM_SAMPLETABLE 20 #define NUM_SAMPLE_INSTANCE 10 -#define NUM_SAMPLE_CANDIDATE 999 // print all candidates +#define NUM_SAMPLE_CANDIDATE 9 #define SAMPLE_FILTER_THRESHOLD 10 // SAMPLE_FILTER_THRESHOLD/ 100 #define GETSAMPLE_BEFOREMERGING 1 // Get the sample data before merging CS's #define NUM_PROP_SUPPORT_SAMPLE 5 // how many properties should be added to the sample data because of a high support (excluding subject column) ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - do not add tables without candidates to sample data
Changeset: 162b64fd0507 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=162b64fd0507 Modified Files: monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: do not add tables without candidates to sample data diffs (11 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -6260,6 +6260,7 @@ str printFullSampleData(CSSampleExtend * for (i = 0; i < num; i++){ sample = csSampleEx[i]; + if ((int)sample.candidateCount == 1 && sample.candidates[0] == BUN_NONE) continue; // do not print tables withoud candidates freqCS = freqCSset->items[sample.freqIdx]; fprintf(fout,"Table %d, %d tuples\n", i, freqCS.support); fprintf(foutrand,"Table %d, %d tuples\n", i, freqCS.support); ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - use full URIs in sample data
Changeset: 1ac4a7056475 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=1ac4a7056475 Modified Files: monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: use full URIs in sample data diffs (41 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -6024,7 +6024,7 @@ str printSampleData(CSSample *csSample, isSite = 0; for (j = 0; j numPropsInSampleTable; j++){ int index = j; - if (sampleVersion 1){ //Do not consider infreq Prop + if (sampleVersion 1){ index = propOrder[index]; // apply mapping to change order of properties } #if USE_SHORT_NAMES @@ -6103,7 +6103,7 @@ str printSampleData(CSSample *csSample, for (j = 0; j numPropsInSampleTable; j++){ int index = j; - if (sampleVersion 1){ //Do not consider infreq Prop + if (sampleVersion 1){ index = propOrder[index]; // apply mapping to change order of properties } objOid = sample.lstObj[index][k]; @@ -6480,18 +6480,9 @@ str printFullSampleData(CSSampleExtend * fprintf(foutis, echo \); //All the instances for (k = 0; k sample.numInstances; k++){ -#if USE_SHORT_NAMES - str subjStrShort = NULL; -#endif takeOid(sample.lstSubjOid[k], subjStr); -#if USE_SHORT_NAMES - getPropNameShort(subjStrShort, subjStr); - fprintf(fout,%s, subjStrShort); - fprintf(foutis,%s, subjStrShort); - GDKfree(subjStrShort); -#else + fprintf(foutis,%s, subjStr); fprintf(fout,%s, subjStr); -#endif GDKfree(subjStr); for (j = 0; j numPropsInSampleTable; j++){ ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - add properties with high tfidf scores to sample data
Changeset: b443cd8459e9 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=b443cd8459e9 Modified Files: monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message: add properties with high tfidf scores to sample data diffs (170 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -5763,7 +5763,7 @@ void getTblName(str *name, oid nameId, B #if NO_OUTPUTFILE == 0 static -str printSampleData(CSSample *csSample, CSset *freqCSset, BAT *mbat, int num, int sampleVersion){ +str printSampleData(CSSample *csSample, CSset *freqCSset, BAT *mbat, int num, int sampleVersion, PropStat *propStat){ int i,j, k; FILE*fout, *fouttb, *foutis; @@ -5777,6 +5777,8 @@ str printSampleData(CSSample *csSample, CSSamplesample; CS freqCS; int*propOrder; + int*propOrderTfidf; + float* tfidfValues; int numPropsInSampleTable; charobjType = 0; str objStr; @@ -5883,14 +5885,19 @@ str printSampleData(CSSample *csSample, // Compute property order (descending by support) and number of properties that are printed if (sampleVersion 1) { - numPropsInSampleTable = (sample.numPropNUM_PROPS_IN_SAMPLE_DATA)?NUM_PROPS_IN_SAMPLE_DATA:sample.numProp; + int found = 0; + numPropsInSampleTable = (sample.numProp(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE))?(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE):sample.numProp; propOrder = GDKmalloc(sizeof(int) * sample.numProp); + propOrderTfidf = GDKmalloc(sizeof(int) * sample.numProp); + tfidfValues = GDKmalloc(sizeof(float) * sample.numProp); for (j = 0; j sample.numProp; ++j) { propOrder[j] = j; - } - - // insertion sort - // do not sort Subject (first property), it should remain at the first position + propOrderTfidf[j] = j; + } + + // To get the top NUM_PROP_SUPPORT_SAMPLE properties, sort all properties descending by support. 
+ // The subject column remains at the first position regardless of its support. + // Sort using insertion sort. for (j = 2; j sample.numProp; ++j) { int tmpPos = propOrder[j]; int tmpVal = freqCS.lstPropSupport[tmpPos]; @@ -5902,6 +5909,51 @@ str printSampleData(CSSample *csSample, propOrder[k + 1] = tmpPos; } + // To get the top NUM_PROP_TFIDF_SAMPLE properties, sort all properties descending by tf-idf score. + for (j = 1; j sample.numProp; ++j) { + float tfidf; + BUN bun = BUNfnd(BATmirror(propStat-pBat),(ptr) sample.lstProp[j]); + if (bun == BUN_NONE) { + printf(Error: property not found\n); + } else { + tfidf = propStat-tfidfs[bun]; + } + tfidfValues[j] = tfidf; + } + + // Sort using insertion sort. Ignore subject column + for (j = 2; j sample.numProp; ++j) { + int tmpPos = propOrderTfidf[j]; + float tmpVal = tfidfValues[tmpPos]; + int k = j - 1; + while (k = 1 tfidfValues[propOrderTfidf[k]] tmpVal) { // sort descending + propOrderTfidf[k + 1] = propOrderTfidf[k]; + k--; + } + propOrderTfidf[k + 1] = tmpPos; + } + + // Add NUM_PROP_TFIDF_SAMPLE properties to propOrder that have a high tfidf score but are not yet in the top 1+NUM_PROP_TFIDF_SAMPLE values of propOrder + for (j = 1; j sample.numProp; ++j) { + int prop, foundProp, bound; + if (found == NUM_PROP_TFIDF_SAMPLE) break; + prop = propOrderTfidf[j]; + // check if prop is already choosen + foundProp = 0; + bound =
MonetDB: rdf - fix number of properties printed as sample data
Changeset: ff5c66817286 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=ff5c66817286 Modified Files: monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: fix number of properties printed as sample data diffs (12 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -5995,7 +5995,7 @@ str printSampleData(CSSample *csSample, #endif GDKfree(subjStr); - for (j = 0; j sample.numProp; j++){ + for (j = 0; j numPropsInSampleTable; j++){ int index = j; if (sampleVersion 1){ //Do not consider infreq Prop index = propOrder[index]; // apply mapping to change order of properties ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - change sampleData generation to generate only 8 c...
Changeset: c78661b8a206 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c78661b8a206 Modified Files: monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message: change sampleData generation to generate only 8 columns (ordered by support) diffs (169 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -761,14 +761,6 @@ char isInfrequentProp(PropTypes pt, CS c #if NO_OUTPUTFILE == 0 static -char isInfrequentSampleProp(CS freqCS, int propIdx){ - if (freqCS.lstPropSupport[propIdx] * 100 freqCS.support * SAMPLE_FILTER_THRESHOLD) return 1; - else return 0; -} -#endif - -#if NO_OUTPUTFILE == 0 -static char isInfrequentSampleCol(CS freqCS, PropTypes pt){ if (pt.propFreq * 100 freqCS.support * SAMPLE_FILTER_THRESHOLD) return 1; else return 0; @@ -5784,6 +5776,8 @@ str printSampleData(CSSample *csSample, char* schema = rdf; CSSamplesample; CS freqCS; + int*propOrder; + int numPropsInSampleTable; charobjType = 0; str objStr; oid objOid = BUN_NONE; @@ -5833,7 +5827,7 @@ str printSampleData(CSSample *csSample, for (i = 0; i num; i++){ sample = csSample[i]; freqCS = freqCSset-items[sample.freqIdx]; - fprintf(fout,Sample table %d Candidates: , i); + fprintf(fout,Table %d\n, i); for (j = 0; j (int)sample.candidateCount; j++){ //fprintf(fout,BUNFMT,sample.candidates[j]); if (sample.candidates[j] != BUN_NONE){ @@ -5844,10 +5838,12 @@ str printSampleData(CSSample *csSample, getStringName(sample.candidates[j], canStr, mapi, mbat, 1); #if USE_SHORT_NAMES getPropNameShort(canStrShort, canStr); - fprintf(fout,;%s, canStrShort); + if (j+1 == (int)sample.candidateCount) fprintf(fout, %s, canStrShort); + else fprintf(fout, %s;, canStrShort); GDKfree(canStrShort); #else - fprintf(fout,;%s, canStr); + if (j+1 == (int)sample.candidateCount) fprintf(fout, %s, canStr); + else fprintf(fout, %s;, canStr); #endif GDKfree(canStr); @@ 
-5882,6 +5878,35 @@ str printSampleData(CSSample *csSample, else fprintf(fouttb,CREATE TABLE tbSample%d \n (\n, i); + //Number of tuples + fprintf(fout, %d\n, freqCS.support); + + // Compute property order (descending by support) and number of properties that are printed + if (sampleVersion 1) { + numPropsInSampleTable = (sample.numPropNUM_PROPS_IN_SAMPLE_DATA)?NUM_PROPS_IN_SAMPLE_DATA:sample.numProp; + propOrder = GDKmalloc(sizeof(int) * sample.numProp); + for (j = 0; j sample.numProp; ++j) { + propOrder[j] = j; + } + + // insertion sort + // do not sort Subject (first property), it should remain at the first position + for (j = 2; j sample.numProp; ++j) { + int tmpPos = propOrder[j]; + int tmpVal = freqCS.lstPropSupport[tmpPos]; + int k = j - 1; + while (k = 1 freqCS.lstPropSupport[propOrder[k]] tmpVal) { // sort descending + propOrder[k + 1] = propOrder[k]; + k--; + } + propOrder[k + 1] = tmpPos; + } + + } else { + numPropsInSampleTable = sample.numProp; // all properties, no change in order because freqCS.lstPropSupport[] is not yet available + } + + //List of columns fprintf(fout,Subject); fprintf(fouttb,SubjectCol string); @@ -5891,14 +5916,15 @@ str printSampleData(CSSample *csSample, isDescription = 0; isImage = 0; isSite = 0; - for (j = 0; j sample.numProp; j++){ + for (j = 0; j numPropsInSampleTable; j++){ + int index = j; if (sampleVersion 1){ //Do not consider infreq Prop - if (isInfrequentSampleProp(freqCS, j)) continue; +
MonetDB: rdf - use all candidates for survey
Changeset: 30047a755a5c for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=30047a755a5c Modified Files: monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message: use all candidates for survey diffs (12 lines): diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h --- a/monetdb5/extras/rdf/rdfschema.h +++ b/monetdb5/extras/rdf/rdfschema.h @@ -396,7 +396,7 @@ typedef struct CSPropTypes { #define NUM_SAMPLETABLE 20 #define NUM_SAMPLE_INSTANCE 10 -#define NUM_SAMPLE_CANDIDATE 3 +#define NUM_SAMPLE_CANDIDATE 999 // print all candidates #define SAMPLE_FILTER_THRESHOLD 10 // SAMPLE_FILTER_THRESHOLD/ 100 #define GETSAMPLE_BEFOREMERGING 1 // Get the sample data before merging CS's #define NUM_PROPS_IN_SAMPLE_DATA 8 // how many properties should be printed (including subject column) ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - fix baseline for normalizing tf-idf scores
Changeset: 9db5008798de for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9db5008798de Modified Files: monetdb5/extras/rdf/rdflabels.c Branch: rdf Log Message: fix baseline for normalizing tf-idf scores diffs (30 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -1413,7 +1413,7 @@ void createPropStatistics(PropStat* prop */ //[DUC] Create propstat for ontology only static -void createPropStatistics(PropStat* propStat, oid** ontattributes, int ontattributesCount) { +void createPropStatistics(PropStat* propStat, oid** ontattributes, int ontattributesCount, int ontmetadataCount) { int i; int numProps = 0; @@ -1445,7 +1445,7 @@ void createPropStatistics(PropStat* prop } for (i = 0; i propStat-numAdded; ++i) { - propStat-tfidfs[i] = log(((float)numProps) / (1 + propStat-freqs[i])); + propStat-tfidfs[i] = log(((float)ontmetadataCount) / (1 + propStat-freqs[i])); } } @@ -1475,7 +1475,7 @@ void createOntologyLookupResult(oid** re //[DUC] Change the function for getting propStat. Use ontattributes for the propStat. // Not the properties from freqCS //createPropStatistics(propStat, freqCSset-numCSadded, freqCSset); - createPropStatistics(propStat, ontattributes, ontattributesCount); + createPropStatistics(propStat, ontattributes, ontattributesCount, ontmetadataCount); for (i = 0; i freqCSset-numCSadded; ++i) { CS cs; ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - fix baseline for normalizing tf-idf scores
Changeset: a6392de1b2d0 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=a6392de1b2d0 Modified Files: monetdb5/extras/rdf/rdflabels.c Branch: rdf Log Message: fix baseline for normalizing tf-idf scores diffs (27 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -1385,12 +1385,14 @@ void createPropStatistics(PropStat* prop static void createPropStatistics(PropStat* propStat, oid** ontattributes, int ontattributesCount) { int i; + int numProps = 0; for (i = 0; i < ontattributesCount; ++i) { oid attr = ontattributes[1][i]; // add prop to propStat BUN bun = BUNfnd(BATmirror(propStat->pBat), (ptr) &attr); if (bun == BUN_NONE) { + numProps++; if (propStat->pBat->T->hash && BATcount(propStat->pBat) > 4 * propStat->pBat->T->hash->mask) { HASHdestroy(propStat->pBat); BAThash(BATmirror(propStat->pBat), 2*BATcount(propStat->pBat)); @@ -1413,7 +1415,7 @@ void createPropStatistics(PropStat* prop } for (i = 0; i < propStat->numAdded; ++i) { - propStat->tfidfs[i] = log(((float)ontattributesCount) / (1 + propStat->freqs[i])); + propStat->tfidfs[i] = log(((float)numProps) / (1 + propStat->freqs[i])); } } ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - simplify computation of ontology-based label beca...
Changeset: 3967b6658444 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3967b6658444 Modified Files: monetdb5/extras/rdf/rdflabels.c Branch: rdf Log Message: simplify computation of ontology-based label because of changed order of data sources when assigning labels diffs (94 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -2076,7 +2076,7 @@ void removeDuplicatedCandidates(CSlabel /* For one CS: Choose the best table name out of all collected candidates (ontology, type, fk). */ static void getTableName(CSlabel* label, int csIdx, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, TypeStat* typeStat, int typeStatCount, oid** result, int* resultCount, IncidentFKs* links, oid** ontmetadata, int ontmetadataCount, BAT *ontmetaBat, OntClass *ontclassSet) { - int i, j, k; + int i, j; oid *tmpList; int tmpListCount; charnameFound = 0; @@ -2243,9 +2243,8 @@ void getTableName(CSlabel* label, int cs label-candidatesCount += resultCount[csIdx]; } - // one ontology class -- use it - if (!nameFound){ - if (resultCount[csIdx] == 1) { + // chose first ontology candidate as label + if (!nameFound resultCount[csIdx] = 1){ label-name = result[csIdx][0]; label-hierarchy = getOntoHierarchy(label-name, (label-hierarchyCount), ontmetadata, ontmetadataCount); nameFound = 1; @@ -2253,69 +2252,6 @@ void getTableName(CSlabel* label, int cs label-isOntology = 1; #endif } - } - - if (!nameFound) { - // multiple ontology classes -- intersect with types - if (resultCount[csIdx] 1) { - tmpList = NULL; - tmpListCount = 0; - // search for type values - for (i = 0; i typeAttributesCount; ++i) { - for (j = 0; j typeAttributesHistogramCount[csIdx][i]; ++j) { - if (typeAttributesHistogram[csIdx][i][j].percent TYPE_FREQ_THRESHOLD) break; // sorted - - // intersect type with ontology classes - for (k = 0; k 
resultCount[csIdx]; ++k) { - if (result[csIdx][k] == typeAttributesHistogram[csIdx][i][j].value) { - // found, copy ontology class to tmpList - tmpList = (oid *) realloc(tmpList, sizeof(oid) * (tmpListCount + 1)); - if (!tmpList) fprintf(stderr, ERROR: Couldn't realloc memory!\n); - tmpList[tmpListCount] = result[csIdx][k]; - tmpListCount += 1; - } - } - } - } - - // only one left -- use it - if (tmpListCount == 1) { - label-name = tmpList[0]; - label-hierarchy = getOntoHierarchy(label-name, (label-hierarchyCount), ontmetadata, ontmetadataCount); - free(tmpList); - nameFound = 1; - #if INFO_WHERE_NAME_FROM - label-isOntology = 1; - #endif - } - - if (!nameFound) { - // multiple left -- use the class that covers most attributes, most popular ontology, ... - if (tmpListCount 1) { - label-name = tmpList[0]; // sorted - label-hierarchy = getOntoHierarchy(label-name, (label-hierarchyCount), ontmetadata, ontmetadataCount); - free(tmpList); - nameFound = 1; - - #if INFO_WHERE_NAME_FROM - label-isOntology = 1; - #endif - } - } - - if (!nameFound) { - // empty intersection - use the class that covers most attributes, most popular ontology, .. - label-name = result[csIdx][0]; // sorted
MonetDB: rdf - fix compile errors when USE_MULTIWAY_MERGING is s...
Changeset: 608c23981c16 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=608c23981c16 Modified Files: monetdb5/extras/rdf/rdflabels.c monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: fix compile errors when USE_MULTIWAY_MERGING is set to 1, fix mergeCandidates after the order of data types has been changed to (type - onto - fk) diffs (281 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -2016,7 +2016,7 @@ oid* getOntoHierarchy(oid ontology, int* static void removeDuplicatedCandidates(CSlabel *label) { int i, j; - int cNew = label-candidatesNew, cOnto = label-candidatesOntology, cType = label-candidatesType, cFK = label-candidatesFK; + int cNew = label-candidatesNew, cType = label-candidatesType, cOnto = label-candidatesOntology, cFK = label-candidatesFK; if (label-candidatesCount 2) return; // no duplicates @@ -2028,8 +2028,8 @@ void removeDuplicatedCandidates(CSlabel // find out which category (new, onto, type, fk) we are in int *cPtr = NULL; if (j label-candidatesNew) cPtr = cNew; - else if (j label-candidatesNew + label-candidatesOntology) cPtr = cOnto; - else if (j label-candidatesNew + label-candidatesOntology + label-candidatesType) cPtr = cType; + else if (j label-candidatesNew + label-candidatesType) cPtr = cType; + else if (j label-candidatesNew + label-candidatesType + label-candidatesOntology) cPtr = cOnto; else cPtr = cFK; if (label-candidates[i] == label-candidates[j] || label-candidates[j] == BUN_NONE) { @@ -2047,8 +2047,8 @@ void removeDuplicatedCandidates(CSlabel // update counts label-candidatesCount -= moveLeft; label-candidatesNew = cNew; + label-candidatesType = cType; label-candidatesOntology = cOnto; - label-candidatesType = cType; label-candidatesFK = cFK; } @@ -2062,10 +2062,10 @@ void removeDuplicatedCandidates(CSlabel // update value in category; if (label-candidatesNew 0) { 
label-candidatesNew--; + } else if (label-candidatesType 0) { + label-candidatesType--; } else if (label-candidatesOntology 0) { label-candidatesOntology--; - } else if (label-candidatesType 0) { - label-candidatesType--; } else { label-candidatesFK--; } @@ -2334,8 +2334,8 @@ CSlabel* initLabels(CSset *freqCSset) { labels[i].candidates = NULL; labels[i].candidatesCount = 0; labels[i].candidatesNew = 0; + labels[i].candidatesType = 0; labels[i].candidatesOntology = 0; - labels[i].candidatesType = 0; labels[i].candidatesFK = 0; labels[i].hierarchy = NULL; labels[i].hierarchyCount = 0; @@ -2790,7 +2790,7 @@ CSlabel* createLabels(CSset* freqCSset, * Result: common name ontology candidates CS1 ontology candidates CS2 type candidates CS1 type candidates CS2 FK candidates CS1 FK candidates CS2 */ static -oid* mergeCandidates(int *candidatesCount, int *candidatesNew, int *candidatesOntology, int *candidatesType, int *candidatesFK, CSlabel cs1, CSlabel cs2, oid commonName) { +oid* mergeCandidates(int *candidatesCount, int *candidatesNew, int *candidatesType, int *candidatesOntology, int *candidatesFK, CSlabel cs1, CSlabel cs2, oid commonName) { oid *candidates; int counter = 0; int i; @@ -2812,38 +2812,38 @@ oid* mergeCandidates(int *candidatesCoun } (*candidatesNew) = counter; - // copy ontology - for (i = 0; i cs1.candidatesOntology; ++i) { + // copy type + for (i = 0; i cs1.candidatesType; ++i) { candidates[counter] = cs1.candidates[cs1.candidatesNew + i]; counter++; } - for (i = 0; i cs2.candidatesOntology; ++i) { + for (i = 0; i cs2.candidatesType; ++i) { candidates[counter] = cs2.candidates[cs2.candidatesNew + i]; counter++; } - (*candidatesOntology) = counter - (*candidatesNew); - - // copy type - for (i = 0; i cs1.candidatesType; ++i) { - candidates[counter] = cs1.candidates[cs1.candidatesNew + cs1.candidatesOntology + i]; + (*candidatesType) = counter - (*candidatesNew); + + // copy ontology + for (i = 0; i cs1.candidatesOntology; ++i) { + candidates[counter] =
MonetDB: rdf - fix creation of typeAttributesHistogram
Changeset: 6832434de2c2 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6832434de2c2 Modified Files: monetdb5/extras/rdf/rdflabels.c Branch: rdf Log Message: fix creation of typeAttributesHistogram type values were assigned to the wrong CS because the new csFreqIdx was used instead of the old one diffs (33 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -967,9 +967,7 @@ void createTypeAttributesHistogram(BAT * // check if property (*pbt) is a type for (i = 0; i < typeAttributesCount; ++i) { if (*pbt == typeAttributesOids[i]) { - // prop is a type! - csFreqIdx = csIdFreqIdxMap[subjCSMap[*sbt]]; // get object obt = (oid *) BUNtloc(oi, p); @@ -988,6 +986,7 @@ void createTypeAttributesHistogram(BAT * // nothing to add to histogram } else { // analyze values and add to histogram + csFreqIdx = csIdFreqIdxMap[subjCSMap[curS]]; // get csFreqIdx of last subject insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat, ontclassSet); typeValuesSize = 0; // reset } @@ -1008,7 +1007,10 @@ void createTypeAttributesHistogram(BAT * } // analyze and add last set of typeValues - if (curS != BUN_NONE && typeValuesSize != 0) insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat, ontclassSet); + if (curS != BUN_NONE && typeValuesSize != 0) { + csFreqIdx = csIdFreqIdxMap[subjCSMap[curS]]; // get csFreqIdx of last subject + insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, ontmetaBat, ontclassSet); + } GDKfree(typeValues); ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - Store type hierarchy for type values
Changeset: da0c1bd43bd3 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=da0c1bd43bd3 Modified Files: monetdb5/extras/rdf/rdflabels.c monetdb5/extras/rdf/rdflabels.h Branch: rdf Log Message: Store type hierarchy for type values Suggested by Peter Instead of storing the leaf value per subject, store the whole hierarchy. By doing so, the frequencies are summed up on the more general levels of the hierarchy. For example, 40% Politicians and 50% Athletes in a CS will be represented as (90% Thing, 90% Agent, 90% Person, 50% Athlete, 40% Politician), resulting in label candidate Person when threshold is set to 80%. diffs (173 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -851,47 +851,17 @@ int compareTypeAttributesFreqs (const vo #endif #if USE_TYPE_NAMES -/* Analyze hierarchy in a list of type values, add all leaf values to the histogram. Values that are not present in the hierarchy tree built from the ontologies are NOT added to the histogram. */ +/* Add type values to the histogram. Values that are not present in the hierarchy tree built from the ontologies are NOT added to the histogram. 
*/ static -void insertLeafsIntoTypeAttributesHistogram(oid* typeList, int typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat, OntClass *ontclassSet) { - int i, j, k; +void insertValuesIntoTypeAttributesHistogram(oid* typeList, int typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat) { + int i, j; int fit; - char*leaf; // flag whether a type value in 'typeList' is a leaf (1) or not (0) - BUN pos; - OntClasshierarchy; - - // start with: every type value is a leaf - leaf = GDKmalloc(sizeof(char) * typeListLength); - for (i = 0; i typeListLength; ++i) leaf[i] = 1; - - // analyze hierarchy + for (i = 0; i typeListLength; ++i) { - if (!leaf[i]) continue; - pos = BUNfnd(BATmirror(ontmetaBat), typeList[i]); - if (pos == BUN_NONE) { - // no ontology information for this type value, therefore it is not added to the hierarchy - leaf[i] = 0; - continue; - } - - // get hierarchy of this type value - hierarchy = ontclassSet[pos]; - - // loop over superclasses, set leaf=0 - for (j = 0; j hierarchy.numsc; ++j) { - for (k = 0; k typeListLength; ++k) { - if (i == k) continue; - if (ontclassSet[hierarchy.scIdxes[j]].cOid == typeList[k]) { - // found superclass at position 'k' - leaf[k] = 0; - } - } - } - } - - // add all leafs to the histogram - for (i = 0; i typeListLength; ++i) { - if (!leaf[i]) continue; + BUN pos = BUNfnd(BATmirror(ontmetaBat), typeList[i]); + if (pos == BUN_NONE) continue; // no ontology information, ignore + + // add to histogram fit = 0; for (j = 0; j typeAttributesHistogramCount[csFreqIdx][type]; ++j) { if (typeAttributesHistogram[csFreqIdx][type][j].value == typeList[i]) { @@ -913,13 +883,11 @@ void insertLeafsIntoTypeAttributesHistog typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type] - 1].freq = 1; } } - - GDKfree(leaf); } /* Loop through all 
subjects to collect frequency statistics for type attribute values. */ static -void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat, OntClass *ontclassSet) { +void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat) { // looping, extracting BUN p, q; oid *sbt, *obt, *pbt; @@ -987,7 +955,7 @@ void createTypeAttributesHistogram(BAT * } else { // analyze values and add to
MonetDB: rdf - Improve label quality
Changeset: 3e4ece2b7085 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3e4ece2b7085 Modified Files: monetdb5/extras/rdf/rdflabels.c Branch: rdf Log Message: Improve label quality - Computation of similarity between CS's and classes is now based on the assumption that all properties of a CS should belong to one ontology class, not that the CS has to consist of ALL properties of the corresponding ontology class. - Type values are usually multi-valued properties, the values represent the hierarchy the subject belongs to (e.g., if a subject in the dbpedia dataset has type 'Athlete', it also has types 'Person', 'Agent', 'Thing'). This hierarchy is analyzed and only the most specific type value (the leaf) is added to the data structures. This improves the label candidates that are computed using type values. diffs (284 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -851,19 +851,89 @@ int compareTypeAttributesFreqs (const vo #endif #if USE_TYPE_NAMES +/* Analyze hierarchy in a list of type values, add all leaf values to the histogram. Values that are not present in the hierarchy tree built from the ontologies are NOT added to the histogram. 
*/ +static +void insertLeafsIntoTypeAttributesHistogram(oid* typeList, int typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat, OntClass *ontclassSet) { + int i, j, k; + int fit; + char*leaf; // flag whether a type value in 'typeList' is a leaf (1) or not (0) + BUN pos; + OntClasshierarchy; + + // start with: every type value is a leaf + leaf = GDKmalloc(sizeof(char) * typeListLength); + for (i = 0; i typeListLength; ++i) leaf[i] = 1; + + // analyze hierarchy + for (i = 0; i typeListLength; ++i) { + if (!leaf[i]) continue; + pos = BUNfnd(BATmirror(ontmetaBat), typeList[i]); + if (pos == BUN_NONE) { + // no ontology information for this type value, therefore it is not added to the hierarchy + leaf[i] = 0; + continue; + } + + // get hierarchy of this type value + hierarchy = ontclassSet[pos]; + + // loop over superclasses, set leaf=0 + for (j = 0; j hierarchy.numsc; ++j) { + for (k = 0; k typeListLength; ++k) { + if (i == k) continue; + if (ontclassSet[hierarchy.scIdxes[j]].cOid == typeList[k]) { + // found superclass at position 'k' + leaf[k] = 0; + } + } + } + } + + // add all leafs to the histogram + for (i = 0; i typeListLength; ++i) { + if (!leaf[i]) continue; + fit = 0; + for (j = 0; j typeAttributesHistogramCount[csFreqIdx][type]; ++j) { + if (typeAttributesHistogram[csFreqIdx][type][j].value == typeList[i]) { + // bucket exists + typeAttributesHistogram[csFreqIdx][type][j].freq += 1; + fit = 1; + break; + } + } + if (!fit) { + // bucket does not exist + // realloc + typeAttributesHistogramCount[csFreqIdx][type] += 1; + typeAttributesHistogram[csFreqIdx][type] = (TypeAttributesFreq *) realloc(typeAttributesHistogram[csFreqIdx][type], sizeof(TypeAttributesFreq) * typeAttributesHistogramCount[csFreqIdx][type]); + if (!typeAttributesHistogram[csFreqIdx][type]) fprintf(stderr, ERROR: Couldn't realloc memory!\n); + + // insert value + 
typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type] - 1].value = typeList[i]; + typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type] - 1].freq = 1; + } + } + + GDKfree(leaf); +} + /* Loop through all subjects to collect frequency statistics for type attribute values. */ static -void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, char** typeAttributes) { +void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter oi, oid *subjCSMap,
MonetDB: rdf - Workaround for memory bug: increase INIT_NUM_CS
Changeset: 2d593f5bbd8f for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=2d593f5bbd8f Modified Files: monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message: Workaround for memory bug: increase INIT_NUM_CS diffs (12 lines): diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h --- a/monetdb5/extras/rdf/rdfschema.h +++ b/monetdb5/extras/rdf/rdfschema.h @@ -169,7 +169,7 @@ typedef struct SubCSSet{ int numAllocation; } SubCSSet; -#define INIT_NUM_CS 100 +#define INIT_NUM_CS 9 // workaround #define SIM_THRESHOLD 0.6 #define SIM_TFIDF_THRESHOLD 0.55 #define IMPORTANCE_THRESHOLD 0.01 ___ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - Remove duplicate values in candidate lists and up...
Changeset: 6cc339a6347d for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6cc339a6347d Modified Files: monetdb5/extras/rdf/rdf.h monetdb5/extras/rdf/rdflabels.c Branch: rdf Log Message: Remove duplicate values in candidate lists and update candidate lists when merging CS's. diffs (truncated from 316 to 300 lines): diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h --- a/monetdb5/extras/rdf/rdf.h +++ b/monetdb5/extras/rdf/rdf.h @@ -108,6 +108,10 @@ typedef struct CSlabel { oid name; // table name oid *candidates;// list of table name candidates, candidates[0] == name int candidatesCount;// number of entries in the candidates list + int candidatesNew; // number of candidates that are created during merging (e.g. ancestor name) + int candidatesOntology; // number of ontology candidates (first category) + int candidatesType; // number of type candidates (second category) + int candidatesFK; // number of fk candidates (third category) oid *hierarchy; // hierarchy bottom to top int hierarchyCount; // number of entries in the hierarchy list int numProp;// number of properties, copied from freqCSset-items[x].numProp diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -1831,6 +1831,50 @@ oid* getOntoHierarchy(oid ontology, int* return hierarchy; } +/* Remove duplicated candidate values and remove DUMMY values if better candidates exist + */ +static +void removeDuplicatedCandidates(CSlabel *label) { + int i, j; + int cNew = label-candidatesNew, cOnto = label-candidatesOntology, cType = label-candidatesType, cFK = label-candidatesFK; + + if (label-candidatesCount 2) return; // no duplicates + + // loop through all candidates + for (i = 0; i label-candidatesCount - 1; ++i) { + // search (direction: right) whether this value occurs again + int moveLeft = 0; + for (j = i + 1; j label-candidatesCount; ++j) { + // find out which category 
(new, onto, type, fk) we are in + int *cPtr = NULL; + if (j label-candidatesNew) cPtr = cNew; + else if (j label-candidatesNew + label-candidatesOntology) cPtr = cOnto; + else if (j label-candidatesNew + label-candidatesOntology + label-candidatesType) cPtr = cType; + else cPtr = cFK; + + if (label-candidates[i] == label-candidates[j] || label-candidates[j] == BUN_NONE) { + // DUMMY value will be overwritten + // OR: + // value occurs again, will be overwritten + moveLeft++; + (*cPtr)--; + } else { + // different value, keep it + label-candidates[j - moveLeft] = label-candidates[j]; + } + } + // value 'i' is unique now + // update counts + label-candidatesCount -= moveLeft; + label-candidatesNew = cNew; + label-candidatesOntology = cOnto; + label-candidatesType = cType; + label-candidatesFK = cFK; + } + + // DUMMY value on position 0 is kept to ensure that name == candidates[0] +} + #if USE_TABLE_NAME /* For one CS: Choose the best table name out of all collected candidates (ontology, type, fk). */ static @@ -1843,6 +1887,7 @@ void getTableName(CSlabel* label, int cs // --- ONTOLOGY --- // add all ontology candidates to list of candidates if (resultCount[csIdx] = 1) { + label-candidatesOntology = resultCount[csIdx]; label-candidates = GDKrealloc(label-candidates, sizeof(oid) * (label-candidatesCount + resultCount[csIdx])); if (!label-candidates) fprintf(stderr, ERROR: Couldn't realloc memory!\n); for (i = 0; i resultCount[csIdx]; ++i) { @@ -1925,6 +1970,7 @@ void getTableName(CSlabel* label, int cs // add all most frequent type values to list of candidates if (tmpListCount = 1) { int counter = 0; + label-candidatesType = tmpListCount; label-candidates = GDKrealloc(label-candidates, sizeof(oid) * (label-candidatesCount + tmpListCount)); if (!label-candidates) fprintf(stderr, ERROR: Couldn't realloc memory!\n); for (i = 0; i typeStatCount; ++i) { @@ -1965,6 +2011,7 @@ void
MonetDB: rdf - Remove dummy values from candidate lists and upda...
Changeset: 5caad43d9d63 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=5caad43d9d63 Modified Files: monetdb5/extras/rdf/rdflabels.c Branch: rdf Log Message: Remove dummy values from candidate lists and update CS name accordingly DUMMY values on position 0 of the candidate list will be removed. Therefore, CS names have to be updated to ensure that (candidates[0] == name). diffs (69 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -1872,7 +1872,24 @@ void removeDuplicatedCandidates(CSlabel label-candidatesFK = cFK; } - // DUMMY value on position 0 is kept to ensure that name == candidates[0] + // remove DUMMY value on position 0 + if (label-candidates[0] == BUN_NONE label-candidatesCount 1) { + for (i = 1; i label-candidatesCount; ++i) { + label-candidates[i - 1] = label-candidates[i]; + } + label-candidatesCount--; + + // update value in category; + if (label-candidatesNew 0) { + label-candidatesNew--; + } else if (label-candidatesOntology 0) { + label-candidatesOntology--; + } else if (label-candidatesType 0) { + label-candidatesType--; + } else { + label-candidatesFK--; + } + } } #if USE_TABLE_NAME @@ -2624,6 +2641,9 @@ str updateLabel(int ruleNumber, CSset *f label-candidatesType = candidatesType; label-candidatesFK = candidatesFK; removeDuplicatedCandidates(label); + if (label-name == BUN_NONE label-candidates[0] != BUN_NONE) { + label-name = label-candidates[0]; + } // hierarchy if ((*labels)[freqCS1].name == label-name) { @@ -2667,6 +2687,9 @@ str updateLabel(int ruleNumber, CSset *f label-candidatesType = candidatesType; label-candidatesFK = candidatesFK; removeDuplicatedCandidates(label); + if (label-name == BUN_NONE label-candidates[0] != BUN_NONE) { + label-name = label-candidates[0]; + } // hierarchy freqCS1Counter = (*labels)[freqCS1].hierarchyCount - 1; @@ -2699,6 +2722,9 @@ str updateLabel(int ruleNumber, CSset *f 
label-candidatesType = candidatesType; label-candidatesFK = candidatesFK; removeDuplicatedCandidates(label); + if (label-name == BUN_NONE label-candidates[0] != BUN_NONE) { + label-name = label-candidates[0]; + } // hierarchy already set // properties already set @@ -2728,6 +2754,9 @@ str updateLabel(int ruleNumber, CSset *f label-candidatesType = candidatesType; label-candidatesFK = candidatesFK; removeDuplicatedCandidates(label); + if (label-name == BUN_NONE label-candidates[0] != BUN_NONE) { + label-name = label-candidates[0]; + } // hierarchy label-hierarchyCount = big.hierarchyCount; ___ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - Fix use of short and long names in GraphViz export
Changeset: fb94f422a1f0 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=fb94f422a1f0 Modified Files: monetdb5/extras/rdf/rdflabels.c Branch: rdf Log Message: Fix use of short and long names in GraphViz export diffs (109 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -1664,6 +1664,7 @@ void printUML2(CSset *freqCSset, CSlabel for (i = 0; i freqCSset-numCSadded; ++i) { int width; str labelStr; + str tmpStr; str labelStrEscaped = NULL; #if USE_SHORT_NAMES str labelStrShort = NULL; @@ -1682,7 +1683,8 @@ void printUML2(CSset *freqCSset, CSlabel if (!labelStrEscaped) fprintf(stderr, ERROR: Couldn't malloc memory!\n); strcpy(labelStrEscaped, DUMMY); } else { - takeOid(labels[i].name, labelStr); + takeOid(labels[i].name, tmpStr); + labelStr = removeBrackets(tmpStr); #if USE_SHORT_NAMES getPropNameShort(labelStrShort, labelStr); labelStrEscaped = (str) GDKmalloc(sizeof(char) * (strlen(labelStrShort) + 1)); @@ -1707,19 +1709,13 @@ void printUML2(CSset *freqCSset, CSlabel str propStr; str tmpStr; char*propStrEscaped = NULL; +#if USE_SHORT_NAMES char*propStrShort = NULL; +#endif str color; takeOid(cs.lstProp[j], tmpStr); - // copy propStr to propStrEscaped because .dot-PORTs cannot contain colons and quotes - propStr = removeBrackets(tmpStr); - propStrEscaped = (char *) malloc(sizeof(char) * (strlen(propStr) + 1)); - if (!propStrEscaped) fprintf(stderr, ERROR: Couldn't malloc memory!\n); - memcpy(propStrEscaped, propStr, (strlen(propStr) + 1)); - escapeURI(propStrEscaped); - getPropNameShort(propStrShort, propStr); - // assign color (the more tuples the property occurs in, the darker if ((1.0 * cs.lstPropSupport[j])/cs.support 0.8) { color = #FF; @@ -1732,10 +1728,22 @@ void printUML2(CSset *freqCSset, CSlabel } else { color = #FF; } + + // copy propStr to propStrEscaped because .dot-PORTs cannot contain colons and quotes + propStr = 
removeBrackets(tmpStr); + propStrEscaped = (char *) malloc(sizeof(char) * (strlen(propStr) + 1)); + if (!propStrEscaped) fprintf(stderr, ERROR: Couldn't malloc memory!\n); + memcpy(propStrEscaped, propStr, (strlen(propStr) + 1)); + escapeURI(propStrEscaped); +#if USE_SHORT_NAMES + getPropNameShort(propStrShort, propStr); fprintf(fout, TRTD BGCOLOR=\%s\ PORT=\%s\%s (%d%%)/TD/TR\n, color, propStrEscaped, propStrShort, (100 * cs.lstPropSupport[j])/cs.support); + GDKfree(propStrShort); +#else + fprintf(fout, TRTD BGCOLOR=\%s\ PORT=\%s\%s (%d%%)/TD/TR\n, color, propStrEscaped, propStrEscaped, (100 * cs.lstPropSupport[j])/cs.support); +#endif GDKfree(propStr); - GDKfree(propStrShort); free(propStrEscaped); GDKfree(tmpStr); @@ -1752,6 +1760,9 @@ void printUML2(CSset *freqCSset, CSlabel str tmpStr; str propStr; char*propStrEscaped = NULL; +#if USE_SHORT_NAMES + char*propStrShort = NULL; +#endif takeOid(cs.lstProp[j], tmpStr); @@ -1762,15 +1773,30 @@ void printUML2(CSset *freqCSset, CSlabel memcpy(propStrEscaped, propStr, (strlen(propStr) + 1)); escapeURI(propStrEscaped); +#if USE_SHORT_NAMES + getPropNameShort(propStrShort, propStr); for (k = 0; k relationMetadataCount[i][j]; ++k) { if (relationMetadata[i][j][k].percent = FK_FREQ_THRESHOLD) { // target of links is frequent enough, not an outlier int from = relationMetadata[i][j][k].from; int to = relationMetadata[i][j][k].to; - fprintf(fout, \BUNFMT\:\%s\ - \BUNFMT\ [label=\%s\];\n,
MonetDB: rdf - Update labels when CS's are merged
Changeset: 2b0ab4777950 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=2b0ab4777950 Modified Files: monetdb5/extras/rdf/rdflabels.c monetdb5/extras/rdf/rdflabels.h monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message: Update labels when CS's are merged Updates label, hierarchy and properties. Does not update candidates yet. diffs (truncated from 851 to 300 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -1790,7 +1790,7 @@ oid* getOntoHierarchy(oid ontology, int* // add 'ontology' to hierarchy (*hierarchyCount) = 1; - hierarchy = (oid *) malloc(sizeof(oid) * (*hierarchyCount)); + hierarchy = (oid *) GDKmalloc(sizeof(oid) * (*hierarchyCount)); if (!hierarchy) fprintf(stderr, ERROR: Couldn't malloc memory!\n); hierarchy[(*hierarchyCount) -1] = ontology; @@ -1815,7 +1815,7 @@ oid* getOntoHierarchy(oid ontology, int* // superclass // add 'msuperstr' to hierarchy (*hierarchyCount) += 1; - hierarchy = realloc(hierarchy, sizeof(oid) * (*hierarchyCount)); + hierarchy = GDKrealloc(hierarchy, sizeof(oid) * (*hierarchyCount)); if (!hierarchy) fprintf(stderr, ERROR: Couldn't realloc memory!\n); hierarchy[(*hierarchyCount) -1] = msuper; @@ -1843,7 +1843,7 @@ void getTableName(CSlabel* label, int cs // --- ONTOLOGY --- // add all ontology candidates to list of candidates if (resultCount[csIdx] = 1) { - label-candidates = realloc(label-candidates, sizeof(oid) * (label-candidatesCount + resultCount[csIdx])); + label-candidates = GDKrealloc(label-candidates, sizeof(oid) * (label-candidatesCount + resultCount[csIdx])); if (!label-candidates) fprintf(stderr, ERROR: Couldn't realloc memory!\n); for (i = 0; i resultCount[csIdx]; ++i) { label-candidates[label-candidatesCount + i] = result[csIdx][i]; @@ -1925,7 +1925,7 @@ void getTableName(CSlabel* label, int cs // add all most frequent type values to list of 
candidates if (tmpListCount = 1) { int counter = 0; - label-candidates = realloc(label-candidates, sizeof(oid) * (label-candidatesCount + tmpListCount)); + label-candidates = GDKrealloc(label-candidates, sizeof(oid) * (label-candidatesCount + tmpListCount)); if (!label-candidates) fprintf(stderr, ERROR: Couldn't realloc memory!\n); for (i = 0; i typeStatCount; ++i) { for (j = 0; j tmpListCount; ++j) { @@ -1965,7 +1965,7 @@ void getTableName(CSlabel* label, int cs // --- FK --- // add top3 fk values to list of candidates if (links[csIdx].num 0) { - label-candidates = realloc(label-candidates, sizeof(oid) * (label-candidatesCount + MIN(3, links[csIdx].num))); + label-candidates = GDKrealloc(label-candidates, sizeof(oid) * (label-candidatesCount + MIN(3, links[csIdx].num))); if (!label-candidates) fprintf(stderr, ERROR: Couldn't realloc memory!\n); for (i = 0; i MIN(3, links[csIdx].num); ++i) { label-candidates[label-candidatesCount + i] = links[csIdx].fks[0].prop; @@ -1983,7 +1983,7 @@ void getTableName(CSlabel* label, int cs // --- NOTHING --- if (label-candidatesCount == 0) { - label-candidates = realloc(label-candidates, sizeof(oid)); + label-candidates = GDKrealloc(label-candidates, sizeof(oid)); if (!label-candidates) fprintf(stderr, ERROR: Couldn't realloc memory!\n); label-candidates[0] = BUN_NONE; label-candidatesCount = 1; @@ -2004,7 +2004,7 @@ CSlabel* initLabels(CSset *freqCSset) { CSlabel *labels; int i; - labels = (CSlabel *) malloc(sizeof(CSlabel) * freqCSset-numCSadded); + labels = (CSlabel *) GDKmalloc(sizeof(CSlabel) * freqCSset-numCSadded); if (!labels) fprintf(stderr, ERROR: Couldn't malloc memory!\n); for (i = 0; i freqCSset-numCSadded; ++i) { labels[i].candidates = NULL; @@ -2031,7 +2031,7 @@ void getAllLabels(CSlabel* labels, CSset // copy attribute oids (names) labels[i].numProp = cs.numProp; - labels[i].lstProp = (oid *) malloc(sizeof(oid) * cs.numProp); + labels[i].lstProp = (oid *) GDKmalloc(sizeof(oid) * cs.numProp);
MonetDB: rdf - Store oids instead of strings to improve performance
Changeset: eb32228c325e for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=eb32228c325e Modified Files: monetdb5/extras/rdf/rdf.h monetdb5/extras/rdf/rdflabels.c monetdb5/extras/rdf/rdflabels.h monetdb5/extras/rdf/rdfontologyload.c monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: Store oids instead of strings to improve performance Store oids during the labeling process, transform them into strings for export only. URI string format: http:/// diffs (truncated from 2090 to 300 lines): diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h --- a/monetdb5/extras/rdf/rdf.h +++ b/monetdb5/extras/rdf/rdf.h @@ -105,13 +105,13 @@ typedef enum { // Final data structure that stores the labels for tables and attributes typedef struct CSlabel { - str name; // table name - str *candidates;// list of table name candidates, candidates[0] == name + oid name; // table name + oid *candidates;// list of table name candidates, candidates[0] == name int candidatesCount;// number of entries in the candidates list - str *hierarchy; // hierarchy bottom to top + oid *hierarchy; // hierarchy bottom to top int hierarchyCount; // number of entries in the hierarchy list int numProp;// number of properties, copied from freqCSset-items[x].numProp - char**lstProp; // attribute names (same order as in freqCSset-items[x].lstProp) + oid *lstProp; // attribute names (same order as in freqCSset-items[x].lstProp) } CSlabel; #endif /* _RDF_H_ */ diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -27,79 +27,79 @@ // list of known ontologies int ontologyCount = 73; ontology ontologies[] = { -{{http:, www.facebook.com, 2008}, 3}, -{{http:, facebook.com, 2008}, 3}, -{{http:, developers.facebook.com, schema}, 3}, -{{https:, www.facebook.com, 2008}, 3}, -{{http:, purl.org, dc, elements, 1.1}, 5}, // dc DublinCore -{{http:, purl.org, dc, terms}, 4}, // DublinCore 
-{{http:, purl.org, goodrelations, v1}, 4}, // GoodRelations -{{http:, purl.org, rss, 1.0, modules}, 5}, -{{http:, purl.org, stuff}, 3}, -{{http:, www.purl.org, stuff}, 3}, -{{http:, ogp.me, ns}, 3}, -{{https:, ogp.me, ns}, 3}, -{{http:, www.w3.org, 1999, 02, 22-rdf-syntax-ns}, 5}, // rdf -{{http:, www.w3.org, 2000, 01, rdf-schema}, 5}, // rdfs -{{http:, www.w3.org, 2004, 02, skos, core}, 6}, // skos (Simple Knowledge Organization System) -{{http:, www.w3.org, 2002, 07, owl}, 5}, -{{http:, www.w3.org, 2006, vcard, ns}, 5}, // vcard -{{http:, www.w3.org, 2001, vcard-rdf, 3.0}, 5}, -{{http:, www.w3.org, 2003, 01, geo, wgs84_pos}, 6}, // geo -{{http:, www.w3.org, 1999, xhtml, vocab}, 5}, // xhtml -{{http:, search.yahoo.com, searchmonkey}, 3}, -{{https:, search.yahoo.com, searchmonkey}, 3}, -{{http:, search.yahoo.co.jp, searchmonkey}, 3}, -{{http:, g.yahoo.com, searchmonkey}, 3}, -{{http:, opengraphprotocol.org, schema}, 3}, -{{https:, opengraphprotocol.org, schema}, 3}, -{{http:, opengraph.org, schema}, 3}, -{{https:, opengraph.org, schema}, 3}, -{{http:, creativecommons.org, ns}, 3}, // cc -{{http:, rdf.data-vocabulary.org}, 2}, // by google -{{http:, rdfs.org, sioc, ns}, 4}, // sioc (pronounced shock, Semantically-Interlinked Online Communities Project) -{{http:, xmlns.com, foaf, 0.1}, 4}, // foaf (Friend of a Friend) -{{http:, mixi-platform.com, ns}, 3}, // japanese social graph -{{http:, commontag.org, ns}, 3}, -{{http:, semsl.org, ontology}, 3}, // semantic web for second life -{{http:, schema.org}, 2}, -{{http:, openelectiondata.org, 0.1}, 3}, -{{http:, search.aol.com, rdf}, 3}, -{{http:, www.loc.gov, loc.terms, relators}, 4}, // library of congress -{{http:, dbpedia.org, ontology}, 3}, // dbo -{{http:, dbpedia.org, resource}, 3}, // dbpedia -{{http:, dbpedia.org, property}, 3}, // dbp -{{http:, www.aktors.org, ontology, portal}, 4}, // akt (research, publications, ...) 
-{{http:, purl.org, ontology, bibo}, 4}, // bibo (bibliography) -{{http:, purl.org, ontology, mo}, 4}, // mo (music) -{{http:, www.geonames.org, ontology}, 3}, // geonames -{{http:, purl.org, vocab, frbr, core}, 5}, // frbr (Functional Requirements for Bibliographic Records) -{{http:, www.w3.org, 2001, XMLSchema}, 4}, // xsd -{{http:, www.w3.org, 2006, time}, 4}, // time -{{http:, purl.org, NET, c4dm, event.owl}, 5}, // event -{{http:, www.openarchives.org, ore, terms}, 4}, // ore (Open Archive) -{{http:, purl.org, vocab, bio, 0.1}, 5}, // bio (biographical data) -{{http:, www.holygoat.co.uk, owl, redwood, 0.1, tags}, 6}, // tag -{{http:, rdfs.org, ns, void}, 4}, // void (Vocabulary of Interlinked Datasets) -{{http:, www.w3.org, 2006, http}, 4}, //
MonetDB: rdf - do not use getPropNameShort during labeling
Changeset: 3ed3276b486d for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3ed3276b486d Modified Files: monetdb5/extras/rdf/rdflabels.c Branch: rdf Log Message: do not use getPropNameShort during labeling short names caused an error in creating the hierarchy diffs (164 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -1612,9 +1612,6 @@ void printUML2(CSset *freqCSset, CSlabel for (j = 0; j cs.numProp; ++j) { char*propStrEscaped = NULL; -#if USE_SHORT_NAMES - char*propStrShort = NULL; -#endif takeOid(cs.lstProp[j], tmpStr); @@ -1624,9 +1621,6 @@ void printUML2(CSset *freqCSset, CSlabel if (!propStrEscaped) fprintf(stderr, ERROR: Couldn't malloc memory!\n); memcpy(propStrEscaped, propStr, (strlen(propStr) + 1)); escapeURI(propStrEscaped); -#if USE_SHORT_NAMES - getPropNameShort(propStrShort, propStr); -#endif for (k = 0; k relationMetadataCount[i][j]; ++k) { @@ -1634,11 +1628,7 @@ void printUML2(CSset *freqCSset, CSlabel // target of links is frequent enough, not an outlier int from = relationMetadata[i][j][k].from; int to = relationMetadata[i][j][k].to; -#if USE_SHORT_NAMES - fprintf(fout, \BUNFMT\:\%s\ - \BUNFMT\ [label=\%s\];\n, freqCSset-items[from].csId, propStrEscaped, freqCSset-items[to].csId, propStrShort); // print foreign keys to dot file -#else fprintf(fout, \BUNFMT\:\%s\ - \BUNFMT\ [label=\%s\];\n, freqCSset-items[from].csId, propStrEscaped, freqCSset-items[to].csId, propStr); // print foreign keys to dot file -#endif } } GDKfree(tmpStr); @@ -1701,7 +1691,6 @@ str* getOntoHierarchy(str ontology, int* foundTop = 1; } } - return hierarchy; } @@ -1728,12 +1717,8 @@ void getTableName(CSlabel* label, int cs // one ontology class -- use it if (resultCount[csIdx] == 1) { -#if USE_SHORT_NAMES - getPropNameShort((label-name), result[csIdx][0]); -#else label-name = (char *) malloc(sizeof(char) * (strlen(result[csIdx][0]) + 1)); 
strcpy(label-name, result[csIdx][0]); -#endif label-hierarchy = getOntoHierarchy(label-name, (label-hierarchyCount), ontmetadata, ontmetadataCount); nameFound = 1; } @@ -1762,12 +1747,8 @@ void getTableName(CSlabel* label, int cs // only one left -- use it if (tmpListCount == 1) { -#if USE_SHORT_NAMES - getPropNameShort((label-name), tmpList[0]); -#else label-name = (char *) malloc(sizeof(char) * (strlen(tmpList[0]) + 1)); strcpy(label-name, tmpList[0]); -#endif label-hierarchy = getOntoHierarchy(label-name, (label-hierarchyCount), ontmetadata, ontmetadataCount); free(tmpList); nameFound = 1; @@ -1776,12 +1757,8 @@ void getTableName(CSlabel* label, int cs if (!nameFound) { // multiple left -- use the class that covers most attributes, most popular ontology, ... if (tmpListCount 1) { -#if USE_SHORT_NAMES - getPropNameShort((label-name), tmpList[0]); // sorted -#else label-name = (char *) malloc(sizeof(char) * (strlen(tmpList[0]) + 1)); strcpy(label-name, tmpList[0]); // sorted -#endif label-hierarchy = getOntoHierarchy(label-name, (label-hierarchyCount), ontmetadata, ontmetadataCount); free(tmpList); nameFound = 1; @@ -1790,12 +1767,8 @@ void getTableName(CSlabel* label, int cs if (!nameFound) { // empty intersection - use the class that covers most attributes, most popular ontology, .. -#if USE_SHORT_NAMES - getPropNameShort((label-name), result[csIdx][0]); // sorted -#else label-name = (char *) malloc(sizeof(char) * (strlen(result[csIdx][0]) + 1)); strcpy(label-name, result[csIdx][0]); // sorted -#endif label-hierarchy =
MonetDB: rdf - Add list of candidates for each CSlabel
Changeset: 9aa7d8033c08 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9aa7d8033c08 Modified Files: monetdb5/extras/rdf/rdf.h monetdb5/extras/rdf/rdflabels.c Branch: rdf Log Message: Add list of candidates for each CSlabel Beside the name, a list of label candidates is stored for each CSlabel. The candidates are used by the CS merging algorithm. diffs (truncated from 450 to 300 lines): diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h --- a/monetdb5/extras/rdf/rdf.h +++ b/monetdb5/extras/rdf/rdf.h @@ -106,6 +106,8 @@ typedef enum { // Final data structure that stores the labels for tables and attributes typedef struct CSlabel { str name; // table name + str *candidates;// list of table name candidates, candidates[0] == name + int candidatesCount;// number of entries in the candidates list str *hierarchy; // hierarchy bottom to top int hierarchyCount; // number of entries in the hierarchy list int numProp;// number of properties, copied from freqCSset-items[x].numProp diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -1712,8 +1712,20 @@ void getTableName(CSlabel* label, int cs int i, j, k; str *tmpList; int tmpListCount; + charnameFound = 0; // --- ONTOLOGY --- + // add all ontology candidates to list of candidates + if (resultCount[csIdx] = 1) { + label-candidates = realloc(label-candidates, sizeof(str) * (label-candidatesCount + resultCount[csIdx])); + if (!label-candidates) fprintf(stderr, ERROR: Couldn't realloc memory!\n); + for (i = 0; i resultCount[csIdx]; ++i) { + label-candidates[label-candidatesCount + i] = (char *) malloc(sizeof(char) * (strlen(result[csIdx][i]) + 1)); + strcpy(label-candidates[label-candidatesCount + i], result[csIdx][i]); + } + label-candidatesCount += resultCount[csIdx]; + } + // one ontology class -- use it if (resultCount[csIdx] == 1) { #if USE_SHORT_NAMES @@ -1721,65 +1733,74 @@ void 
getTableName(CSlabel* label, int cs #else label-name = (char *) malloc(sizeof(char) * (strlen(result[csIdx][0]) + 1)); strcpy(label-name, result[csIdx][0]); +#endif label-hierarchy = getOntoHierarchy(label-name, (label-hierarchyCount), ontmetadata, ontmetadataCount); -#endif - return; + nameFound = 1; } - // multiple ontology classes -- intersect with types - if (resultCount[csIdx] 1) { - tmpList = NULL; - tmpListCount = 0; - // search for type values - for (i = 0; i typeAttributesCount; ++i) { - for (j = 0; j typeAttributesHistogramCount[csIdx][i]; ++j) { - if (typeAttributesHistogram[csIdx][i][j].percent TYPE_FREQ_THRESHOLD) break; // sorted - // intersect type with ontology classes - for (k = 0; k resultCount[csIdx]; ++k) { - if (strcmp(result[csIdx][k], typeAttributesHistogram[csIdx][i][j].value) == 0) { - // found, copy ontology class to tmpList - tmpList = (str *) realloc(tmpList, sizeof(str) * (tmpListCount + 1)); - if (!tmpList) fprintf(stderr, ERROR: Couldn't realloc memory!\n); - tmpList[tmpListCount] = result[csIdx][k]; // pointer, no copy - tmpListCount += 1; + if (!nameFound) { + // multiple ontology classes -- intersect with types + if (resultCount[csIdx] 1) { + tmpList = NULL; + tmpListCount = 0; + // search for type values + for (i = 0; i typeAttributesCount; ++i) { + for (j = 0; j typeAttributesHistogramCount[csIdx][i]; ++j) { + if (typeAttributesHistogram[csIdx][i][j].percent TYPE_FREQ_THRESHOLD) break; // sorted + // intersect type with ontology classes + for (k = 0; k resultCount[csIdx]; ++k) { + if (strcmp(result[csIdx][k], typeAttributesHistogram[csIdx][i][j].value) == 0) { + // found, copy ontology class to
MonetDB: rdf - Add ontology tree
Changeset: 4f9d12a701c4 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=4f9d12a701c4 Modified Files: monetdb5/extras/rdf/rdflabels.c monetdb5/extras/rdf/rdflabels.h monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: Add ontology tree Stores distribution of data, used for CS merging diffs (truncated from 331 to 300 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -1656,7 +1656,6 @@ void printUML2(CSset *freqCSset, CSlabel TKNZRclose(ret); } -#if USE_TABLE_NAME static str* getOntoHierarchy(str ontology, int* hierarchyCount, str** ontmetadata, int ontmetadataCount) { int i; @@ -1677,7 +1676,7 @@ str* getOntoHierarchy(str ontology, int* // lookup superclass int foundTuple = 0; for (i = 0; i ontmetadataCount; ++i) { - str muristr = ontmetadata[0][i]; + str muristr = ontmetadata[0][i]; str msuperstr = ontmetadata[1][i]; if (strcmp(hierarchy[(*hierarchyCount) - 1], muristr) == 0) { // found entry @@ -1707,8 +1706,6 @@ str* getOntoHierarchy(str ontology, int* return hierarchy; } -#endif - #if USE_TABLE_NAME /* For one CS: Choose the best table name out of all collected candidates (ontology, type, fk). 
*/ @@ -1972,6 +1969,182 @@ void createLinks(CSset* freqCSset, Relat #endif static +void createOntoUsageTreeStatistics(OntoUsageNode* tree, int numTuples) { + int i; + + if (tree-numChildren == 0) { + // leaf node + tree-numOccurancesSum = tree-numOccurances; + tree-percentage = (1.0 * tree-numOccurancesSum) / numTuples; + } else { + // inner node + tree-numOccurancesSum = tree-numOccurances; + for (i = 0; i tree-numChildren; ++i) { + createOntoUsageTreeStatistics(tree-lstChildren[i], numTuples); + // sum up data + tree-numOccurancesSum += tree-lstChildren[i]-numOccurancesSum; + } + tree-percentage = (1.0 * tree-numOccurancesSum) / numTuples; + } +} + +static +void addToOntoUsageTree(OntoUsageNode* tree, str* hierarchy, int hierarchyCount, int numTuples) { + int i; + str uri; + OntoUsageNode *leaf; + + if (hierarchyCount == 0) { + // found position in tree +// tree-numOccurances += numTuples; // TODO cs.support not yet available + tree-numOccurances += 1; + return; + } + + // search through children + uri = hierarchy[hierarchyCount - 1]; + hierarchyCount--; + for (i = 0; i tree-numChildren; ++i) { + if (strcmp(tree-lstChildren[i]-uri, uri) == 0) { + // found + addToOntoUsageTree(tree-lstChildren[i], hierarchy, hierarchyCount, numTuples); + return; + } + } + + // child not found + // create leaf + leaf = (OntoUsageNode *) malloc(sizeof(OntoUsageNode)); + if (!leaf) + fprintf(stderr, ERROR: Couldn't malloc memory!\n); + leaf-parent = tree; + leaf-uri = (str) malloc(sizeof(char) * (strlen(uri) + 1)); + if (!leaf-uri) + fprintf(stderr, ERROR: Couldn't malloc memory!\n); + strcpy(leaf-uri, uri); + leaf-lstChildren = NULL; + leaf-numChildren = 0; + leaf-numOccurances = 0; + leaf-numOccurancesSum = 0; + leaf-percentage = 0.0; + // add to tree + tree-numChildren++; + tree-lstChildren = realloc(tree-lstChildren, sizeof(OntoUsageNode *) * tree-numChildren); + if (!tree-lstChildren) + fprintf(stderr, ERROR: Couldn't realloc memory!\n); + tree-lstChildren[tree-numChildren - 1] 
= leaf; + // call + addToOntoUsageTree(leaf, hierarchy, hierarchyCount, numTuples); +} + + +static +void printTree(OntoUsageNode* tree, int level) { + int i; + printf(Level %d URI %s Count %d Sum %d Percent %.1f\n, level, tree-uri, tree-numOccurances, tree-numOccurancesSum, tree-percentage * 100); + for (i = 0; i tree-numChildren; ++i) { + printTree(tree-lstChildren[i], level+1); + } +} + +static +void createOntoUsageTree(OntoUsageNode** tree, CSset* freqCSset, str** ontmetadata, int ontmetadataCount, str** result, int* resultCount, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount) { + int i, j, k; + str *tmpList; + int tmpListCount; + int numTuples = 0; + + // init tree with an artifical root node +
MonetDB: rdf - First draft of createFinalLabels, including new UML diagram generation
Changeset: 8c25b051ed3a for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=8c25b051ed3a Modified Files: monetdb5/extras/rdf/rdf.h monetdb5/extras/rdf/rdflabels.c monetdb5/extras/rdf/rdflabels.h monetdb5/extras/rdf/rdfschema.c monetdb5/extras/rdf/rdfschema.h Branch: rdf Log Message: First draft of createFinalLabels, including new UML diagram generation diffs (truncated from 973 to 300 lines): diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h --- a/monetdb5/extras/rdf/rdf.h +++ b/monetdb5/extras/rdf/rdf.h @@ -103,4 +103,13 @@ typedef enum { #define N_GRAPH_BAT (MAP_LEX+1) +// Final data structure that stores the labels for tables and attributes +typedef struct CSlabel { + str name; // table name + str *hierarchy; // hierarchy bottom to top + int hierarchyCount; // number of entries in the hierarchy list + int numProp;// number of properties, copied from freqCSset-items[x].numProp + char**lstProp; // attribute names (same order as in freqCSset-items[x].lstProp) +} CSlabel; + #endif /* _RDF_H_ */ diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -300,6 +300,73 @@ Relation*** initRelationMetadata(int** r return relationMetadata; } +/* Calculate frequency per foreign key relationship. 
*/ +static +Relation*** initRelationMetadata2(int** relationMetadataCount, CSmergeRel* csRelBetweenMergeFreqSet, CSset* freqCSset) { + int i, j, k; + Relation*** relationMetadata; + + int ret; + char* schema = rdf; + + TKNZRopen (NULL, schema); + + relationMetadata = (Relation ***) malloc(sizeof(Relation **) * freqCSset-numCSadded); + if (!relationMetadata) fprintf(stderr, ERROR: Couldn't malloc memory!\n); + for (i = 0; i freqCSset-numCSadded; ++i) { // CS + CS cs; + if (i == -1) continue; // ignore + cs = (CS) freqCSset-items[i]; + relationMetadata[i] = (Relation **) malloc (sizeof(Relation *) * cs.numProp); + if (!relationMetadata[i]) fprintf(stderr, ERROR: Couldn't malloc memory!\n); + for (j = 0; j cs.numProp; ++j) { // propNo in CS order + int sum = 0; + relationMetadataCount[i][j] = 0; + relationMetadata[i][j] = NULL; + for (k = 0; k csRelBetweenMergeFreqSet[i].numRef; ++k) { // propNo in CSrel + + if (csRelBetweenMergeFreqSet[i].lstPropId[k] == cs.lstProp[j]) { + int toId = csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k]; + if (toId == -1) continue; // ignore + relationMetadataCount[i][j] += 1; + + // alloc/realloc + if (relationMetadataCount[i][j] == 1) { + // alloc + relationMetadata[i][j] = (Relation *) malloc (sizeof(Relation)); + if (!relationMetadata[i][j]) fprintf(stderr, ERROR: Couldn't malloc memory!\n); + relationMetadata[i][j][0].to = toId; + relationMetadata[i][j][0].from = i; + relationMetadata[i][j][0].freq = csRelBetweenMergeFreqSet[i].lstCnt[k]; + relationMetadata[i][j][0].percent = -1; + } else { + // realloc + relationMetadata[i][j] = (Relation *) realloc(relationMetadata[i][j], sizeof(Relation) * relationMetadataCount[i][j]); + if (!relationMetadata[i][j]) fprintf(stderr, ERROR: Couldn't realloc memory!\n); + relationMetadata[i][j][relationMetadataCount[i][j] - 1].to = toId; + relationMetadata[i][j][relationMetadataCount[i][j] - 1].from = i; + relationMetadata[i][j][relationMetadataCount[i][j] - 1].freq = 
csRelBetweenMergeFreqSet[i].lstCnt[k]; + relationMetadata[i][j][relationMetadataCount[i][j] - 1].percent = -1; + } + } + } + + // get total count of values + for (k = 0; k
MonetDB: rdf - fix memory leak in URI tokenization
Changeset: dacb05d87466 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=dacb05d87466 Modified Files: monetdb5/extras/rdf/rdflabels.c Branch: rdf Log Message: fix memory leak in URI tokenization diffs (45 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -107,7 +107,7 @@ ontology ontologies[] = { static void getPropNameShort(char** name, char* propStr) { char*token; - char*uri; + char*uri, *uriPtr; int length = 0; // number of tokens char**tokenizedUri = NULL; // list of tokens int i, j; @@ -117,6 +117,7 @@ void getPropNameShort(char** name, char* uri = (char *) malloc(sizeof(char) * (strlen(propStr) + 1)); if (!uri) fprintf(stderr, ERROR: Couldn't malloc memory!\n); strcpy(uri, propStr); // uri will be modified during tokenization + uriPtr = uri; // uri will be modified, uriPtr keeps original pointer token = strtok(uri, /#); while (token != NULL) { tokenizedUri = realloc(tokenizedUri, sizeof(char*) * ++length); @@ -124,6 +125,7 @@ void getPropNameShort(char** name, char* tokenizedUri[length - 1] = token; token = strtok(NULL, /#); } + free(uriPtr); // match with ontologies for (j = 0; j ontologyCount; ++j) { @@ -159,7 +161,7 @@ void getPropNameShort(char** name, char* // no matching ontology found, return content of last token - if (length == 1) { + if (length = 1) { // value (*name) = (char *) malloc(sizeof(char) * (strlen(propStr) + 1)); if (!(*name)) fprintf(stderr, ERROR: Couldn't malloc memory!\n); @@ -171,7 +173,6 @@ void getPropNameShort(char** name, char* } free(tokenizedUri); - free(uri); return; } #endif ___ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - Fix check (csId == -1) when using csIdFreqIdxMap
Changeset: 66f9493e49b6 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=66f9493e49b6 Modified Files: monetdb5/extras/rdf/rdflabels.c Branch: rdf Log Message: Fix check (csId == -1) when using csIdFreqIdxMap diffs (27 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -246,9 +246,10 @@ Relation*** initRelationMetadata(int** r relationMetadata = (Relation ***) malloc(sizeof(Relation **) * freqCSset->numCSadded); if (!relationMetadata) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); for (i = 0; i < num; ++i) { // CS + CS cs; int csId = csIdFreqIdxMap[i]; - CS cs = (CS) freqCSset->items[csId]; if (csId == -1) continue; // ignore + cs = (CS) freqCSset->items[csId]; relationMetadata[csId] = (Relation **) malloc (sizeof(Relation *) * cs.numProp); if (!relationMetadata[csId]) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); for (j = 0; j < cs.numProp; ++j) { // propNo in CS order @@ -491,9 +492,10 @@ void createSQLMetadata(CSset* freqCSset, // set values for (i = 0; i < num; ++i) { + CS cs; int csId = csIdFreqIdxMap[i]; - CS cs = (CS) freqCSset->items[csId]; if (csId == -1) continue; // ignore + cs = (CS) freqCSset->items[csId]; for (j = 0; j < cs.numProp; ++j) { // propNo in CS order // check foreign key frequency ___ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - GDKfree strings that are allocated by takeOid()
Changeset: 008947889f2f for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=008947889f2f Modified Files: monetdb5/extras/rdf/rdflabels.c Branch: rdf Log Message: GDKfree strings that are allocated by takeOid() diffs (283 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -312,27 +312,45 @@ IncidentFKs* initLinks(int csCount) { return links; } -/* Modifies the parameter! */ -/* from: URI/ or URI to: URI */ +/* from: URI/ or URI/ or URI or URI/ to: URI */ static -void removeBrackets(char** s) { - if (strlen(*s) 2) return; +str removeBrackets(char* s) { + str retStr; - if ((*s)[0] == '' (*s)[strlen(*s) - 2] == '' (*s)[strlen(*s) - 1] == '/') { + if (s[0] == '' s[strlen(s) - 2] == '' s[strlen(s) - 1] == '/') { // case URI/ - (*s)[strlen(*s) - 2] = '\0'; - (*s) += 1; - } else if ((*s)[0] == '' (*s)[strlen(*s) - 2] == '/' (*s)[strlen(*s) - 1] == '') { + retStr = (str) GDKmalloc(strlen(s) - 2); + if (!retStr) fprintf(stderr, ERROR: Couldn't malloc memory!\n); + strncpy(retStr, s + 1, strlen(s) - 3); + retStr[strlen(s) - 3] = '\0'; + return retStr; + } else if (s[0] == '' s[strlen(s) - 2] == '/' s[strlen(s) - 1] == '') { // case URI/ - (*s)[strlen(*s) - 2] = '\0'; - (*s) += 1; - } else if ((*s)[0] == '' (*s)[strlen(*s) - 1] == '') { + retStr = (str) GDKmalloc(strlen(s) - 2); + if (!retStr) fprintf(stderr, ERROR: Couldn't malloc memory!\n); + strncpy(retStr, s + 1, strlen(s) - 3); + retStr[strlen(s) - 3] = '\0'; + return retStr; + } else if (s[0] == '' s[strlen(s) - 1] == '') { // case URI - (*s)[strlen(*s) - 1] = '\0'; - (*s) += 1; - } else if ((*s)[strlen(*s) - 1] == '/') { + retStr = (str) GDKmalloc(strlen(s) - 1); + if (!retStr) fprintf(stderr, ERROR: Couldn't malloc memory!\n); + strncpy(retStr, s + 1, strlen(s) - 2); + retStr[strlen(s) - 2] = '\0'; + return retStr; + } else if (s[strlen(s) - 1] == '/') { // case URI/ - (*s)[strlen(*s) - 
1] = '\0'; + retStr = (str) GDKmalloc(strlen(s)); + if (!retStr) fprintf(stderr, ERROR: Couldn't malloc memory!\n); + strncpy(retStr, s + 1, strlen(s) - 1); + retStr[strlen(s) - 1] = '\0'; + return retStr; + } else { + // copy + retStr = (str) GDKmalloc(strlen(s) + 1); + if (!retStr) fprintf(stderr, ERROR: Couldn't malloc memory!\n); + strcpy(retStr, s); + return retStr; } } @@ -567,8 +585,7 @@ void createTypeAttributesHistogram(BAT * BUN p, q; oid *sbt, *obt, *pbt; charobjType; - str propStr, objStr; - char*objStrPtr; + str propStr, objStr, tmpStr; char*start, *end; int length; @@ -620,9 +637,8 @@ void createTypeAttributesHistogram(BAT * if (objType == URI || objType == BLANKNODE) { objOid = objOid - ((oid)objType (sizeof(BUN)*8 - 4)); - takeOid(objOid, objStr); - removeBrackets(objStr); - objStrPtr = objStr; + takeOid(objOid, tmpStr); + objStr = removeBrackets(tmpStr); } else { objOid = objOid - (objType*2 + 1) * RDF_MIN_LITERAL; /* Get the real objOid from Map or Tokenizer */ bun = BUNfirst(mapbat); @@ -633,19 +649,15 @@ void createTypeAttributesHistogram(BAT * end = strrchr(objStr, ''); if (start != NULL end != NULL) { length = end - start; - objStrPtr = (char *) malloc(sizeof(char) * (length + 1)); - if (!objStrPtr) fprintf(stderr, ERROR: Couldn't malloc memory!\n); - memcpy(objStrPtr, start, length); - objStrPtr[length] = '\0'; - } else { - objStrPtr = objStr; +
MonetDB: rdf - Fix segfault on freeing lstObj[]
Changeset: 7eb1425edd90 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=7eb1425edd90 Modified Files: monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: Fix segfault on freeing lstObj[] diffs (23 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -987,11 +987,16 @@ void freeCSset(CSset *csSet){ int i; for(i = 0; i < csSet->numCSadded; i ++){ free(csSet->items[i].lstProp); - #if STOREFULLCS + + } + + #if STOREFULLCS + for(i = 0; i < csSet->numOrigFreqCS; i ++){ free(csSet->items[i].lstObj); - #endif - } + #endif + + free(csSet->items); free(csSet); } ___ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list
MonetDB: rdf - create labels for freqCS, not maxCS/mergeCS
Changeset: 72b6716bcfd7 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=72b6716bcfd7 Modified Files: monetdb5/extras/rdf/rdflabels.c monetdb5/extras/rdf/rdflabels.h monetdb5/extras/rdf/rdfschema.c Branch: rdf Log Message: create labels for freqCS, not maxCS/mergeCS diffs (truncated from 394 to 300 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -221,7 +221,6 @@ int** initRelationMetadataCount(CSset* f if (!relationMetadataCount) fprintf(stderr, ERROR: Couldn't malloc memory!\n); for (i = 0; i freqCSset-numCSadded; ++i) { relationMetadataCount[i] = NULL; - if (freqCSset-items[i].parentFreqIdx != -1) continue; // ignore relationMetadataCount[i] = (int *) malloc(sizeof(int) * freqCSset-items[i].numProp); if (!relationMetadataCount[i]) fprintf(stderr, ERROR: Couldn't malloc memory!\n); for (j = 0; j freqCSset-items[i].numProp; ++j) { @@ -234,7 +233,7 @@ int** initRelationMetadataCount(CSset* f /* Calculate frequency per foreign key relationship. 
*/ static -Relation*** initRelationMetadata(int** relationMetadataCount, CSmergeRel* csRelBetweenMergeFreqSet, CSset* freqCSset) { +Relation*** initRelationMetadata(int** relationMetadataCount, CSrel* csrelSet, int num, CSset* freqCSset, int* csIdFreqIdxMap) { int i, j, k; Relation*** relationMetadata; @@ -245,49 +244,51 @@ Relation*** initRelationMetadata(int** r relationMetadata = (Relation ***) malloc(sizeof(Relation **) * freqCSset-numCSadded); if (!relationMetadata) fprintf(stderr, ERROR: Couldn't malloc memory!\n); - for (i = 0; i freqCSset-numCSadded; ++i) { // CS - CS cs = (CS) freqCSset-items[i]; - if (cs.parentFreqIdx != -1) continue; // ignore - relationMetadata[i] = (Relation **) malloc (sizeof(Relation *) * cs.numProp); - if (!relationMetadata[i]) fprintf(stderr, ERROR: Couldn't malloc memory!\n); + for (i = 0; i num; ++i) { // CS + int csId = csIdFreqIdxMap[i]; + CS cs = (CS) freqCSset-items[csId]; + if (csId == -1) continue; // ignore + relationMetadata[csId] = (Relation **) malloc (sizeof(Relation *) * cs.numProp); + if (!relationMetadata[csId]) fprintf(stderr, ERROR: Couldn't malloc memory!\n); for (j = 0; j cs.numProp; ++j) { // propNo in CS order int sum = 0; - relationMetadataCount[i][j] = 0; - relationMetadata[i][j] = NULL; - for (k = 0; k csRelBetweenMergeFreqSet[i].numRef; ++k) { // propNo in CSrel + relationMetadataCount[csId][j] = 0; + relationMetadata[csId][j] = NULL; + for (k = 0; k csrelSet[i].numRef; ++k) { // propNo in CSrel - if (csRelBetweenMergeFreqSet[i].lstPropId[k] == cs.lstProp[j]) { - int toId = csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k]; - relationMetadataCount[i][j] += 1; + if (csrelSet[i].lstPropId[k] == cs.lstProp[j]) { + int toId = csIdFreqIdxMap[ csrelSet[i].lstRefCSoid[k] ]; + if (toId == -1) continue; // ignore + relationMetadataCount[csId][j] += 1; // alloc/realloc - if (relationMetadataCount[i][j] == 1) { + if (relationMetadataCount[csId][j] == 1) { // alloc - relationMetadata[i][j] = (Relation *) malloc 
(sizeof(Relation)); - if (!relationMetadata[i][j]) fprintf(stderr, ERROR: Couldn't malloc memory!\n); - relationMetadata[i][j][0].to = toId; - relationMetadata[i][j][0].from = i; - relationMetadata[i][j][0].freq = csRelBetweenMergeFreqSet[i].lstCnt[k]; - relationMetadata[i][j][0].percent = -1; + relationMetadata[csId][j] = (Relation *) malloc (sizeof(Relation)); + if (!relationMetadata[csId][j]) fprintf(stderr, ERROR: Couldn't malloc memory!\n); + relationMetadata[csId][j][0].to = toId; +
MonetDB: rdf - Add directory for ontology metadata
Changeset: c8bf33c699c9 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c8bf33c699c9 Added Files: monetdb5/extras/rdf/ontmetadata/loadOntologySAMPLE.sql monetdb5/extras/rdf/ontmetadata/loadOntologyToMonetDB.sh monetdb5/extras/rdf/ontmetadata/ontAttribute.dbpedia.csv monetdb5/extras/rdf/ontmetadata/ontAttribute.dbpedia351.csv monetdb5/extras/rdf/ontmetadata/ontAttribute.gr.csv monetdb5/extras/rdf/ontmetadata/ontMetadata.dbpedia.csv monetdb5/extras/rdf/ontmetadata/ontMetadata.dbpedia351.csv monetdb5/extras/rdf/ontmetadata/ontMetadata.gr.csv Branch: rdf Log Message: Add directory for ontology metadata diffs (truncated from 35690 to 300 lines): diff --git a/monetdb5/extras/rdf/ontmetadata/loadOntologySAMPLE.sql b/monetdb5/extras/rdf/ontmetadata/loadOntologySAMPLE.sql new file mode 100644 --- /dev/null +++ b/monetdb5/extras/rdf/ontmetadata/loadOntologySAMPLE.sql @@ -0,0 +1,2 @@ +COPY NUMMETADATA RECORDS INTO ontmetadata FROM '/export/scratch2/linnea/scripts/loadOntology/ontMetadata.csv' USING DELIMITERS '|', '\n'; +COPY NUMATTRIBUTES RECORDS INTO ontattributes FROM '/export/scratch2/linnea/scripts/loadOntology/ontAttribute.csv' USING DELIMITERS '|', '\n'; diff --git a/monetdb5/extras/rdf/ontmetadata/loadOntologyToMonetDB.sh b/monetdb5/extras/rdf/ontmetadata/loadOntologyToMonetDB.sh new file mode 100755 --- /dev/null +++ b/monetdb5/extras/rdf/ontmetadata/loadOntologyToMonetDB.sh @@ -0,0 +1,19 @@ +NUMMETADATA=`cat ontMetadata.dbpedia.csv | wc -l` +NUMATTRIBUTES=`cat ontAttribute.dbpedia.csv | wc -l` + +cp loadOntologySAMPLE.sql loadtmp.sql +sed -i s:NUMMETADATA:$NUMMETADATA:g loadtmp.sql +sed -i s:NUMATTRIBUTES:$NUMATTRIBUTES:g loadtmp.sql + +mclient -d dbpedia --port=5 loadtmp.sql + + + +NUMMETADATA=`cat ontMetadata.gr.csv | wc -l` +NUMATTRIBUTES=`cat ontAttribute.gr.csv | wc -l` + +cp loadOntologySAMPLE.sql loadtmp.sql +sed -i s:NUMMETADATA:$NUMMETADATA:g loadtmp.sql +sed -i s:NUMATTRIBUTES:$NUMATTRIBUTES:g loadtmp.sql + +mclient -d dbpedia 
--port=5 loadtmp.sql diff --git a/monetdb5/extras/rdf/ontmetadata/ontAttribute.dbpedia.csv b/monetdb5/extras/rdf/ontmetadata/ontAttribute.dbpedia.csv new file mode 100644 --- /dev/null +++ b/monetdb5/extras/rdf/ontmetadata/ontAttribute.dbpedia.csv @@ -0,0 +1,15861 @@ +http://dbpedia.org/ontology/AcademicJournal|http://dbpedia.org/ontology/academicDiscipline +http://dbpedia.org/ontology/AcademicJournal|http://dbpedia.org/ontology/impactFactor +http://dbpedia.org/ontology/AcademicJournal|http://dbpedia.org/ontology/impactFactorAsOf +http://dbpedia.org/ontology/AcademicJournal|http://dbpedia.org/ontology/isPeerReviewed +http://dbpedia.org/ontology/AcademicJournal|http://dbpedia.org/ontology/jstor +http://dbpedia.org/ontology/Game|http://dbpedia.org/ontology/equipment +http://dbpedia.org/ontology/Activity|http://dbpedia.org/ontology/equipment +http://dbpedia.org/ontology/Sport|http://dbpedia.org/ontology/equipment +http://dbpedia.org/ontology/AdultActor|http://dbpedia.org/ontology/arielAward +http://dbpedia.org/ontology/VoiceActor|http://dbpedia.org/ontology/arielAward +http://dbpedia.org/ontology/Actor|http://dbpedia.org/ontology/arielAward +http://dbpedia.org/ontology/VoiceActor|http://dbpedia.org/ontology/geminiAward +http://dbpedia.org/ontology/AdultActor|http://dbpedia.org/ontology/geminiAward +http://dbpedia.org/ontology/Actor|http://dbpedia.org/ontology/geminiAward +http://dbpedia.org/ontology/VoiceActor|http://dbpedia.org/ontology/goldenCalfAward +http://dbpedia.org/ontology/AdultActor|http://dbpedia.org/ontology/goldenCalfAward +http://dbpedia.org/ontology/Actor|http://dbpedia.org/ontology/goldenCalfAward +http://dbpedia.org/ontology/Actor|http://dbpedia.org/ontology/goldenRaspberryAward +http://dbpedia.org/ontology/VoiceActor|http://dbpedia.org/ontology/goldenRaspberryAward +http://dbpedia.org/ontology/AdultActor|http://dbpedia.org/ontology/goldenRaspberryAward +http://dbpedia.org/ontology/Actor|http://dbpedia.org/ontology/iftaAward 
+http://dbpedia.org/ontology/VoiceActor|http://dbpedia.org/ontology/iftaAward +http://dbpedia.org/ontology/AdultActor|http://dbpedia.org/ontology/iftaAward +http://dbpedia.org/ontology/VoiceActor|http://dbpedia.org/ontology/laurenceOlivierAward +http://dbpedia.org/ontology/Actor|http://dbpedia.org/ontology/laurenceOlivierAward +http://dbpedia.org/ontology/AdultActor|http://dbpedia.org/ontology/laurenceOlivierAward +http://dbpedia.org/ontology/Actor|http://dbpedia.org/ontology/naacpImageAward +http://dbpedia.org/ontology/AdultActor|http://dbpedia.org/ontology/naacpImageAward +http://dbpedia.org/ontology/VoiceActor|http://dbpedia.org/ontology/naacpImageAward +http://dbpedia.org/ontology/Actor|http://dbpedia.org/ontology/nationalFilmAward +http://dbpedia.org/ontology/AdultActor|http://dbpedia.org/ontology/nationalFilmAward +http://dbpedia.org/ontology/VoiceActor|http://dbpedia.org/ontology/nationalFilmAward
MonetDB: rdf - Store explicit metadata (tables and relationships)
Changeset: a536099d8d69 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=a536099d8d69 Modified Files: monetdb5/extras/rdf/rdflabels.c Branch: rdf Log Message: Store explicit metadata (tables and relationships) Two tables are created to store information about relationships between tables and #tuples per table diffs (117 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -322,13 +322,14 @@ void escapeURI(char* s) { } /* Modifies the parameter! */ -/* Replaces colons, quotes, spaces, and dashes with underscores. */ +/* Replaces colons, quotes, spaces, and dashes with underscores. All lowercase. */ static void escapeURIforSQL(char* s) { int i; for (i = 0; i (int) strlen(s); ++i) { if (s[i] == ':' || s[i] == '' || s[i] == ' ' || s[i] == '-') s[i] = '_'; + s[i] = tolower(s[i]); } } @@ -364,7 +365,7 @@ void convertToSQL(CSset *freqCSset, Rela if ( freqCSset-items[i].parentFreqIdx != -1) continue; // ignore strcpy(temp, labels[i].name); escapeURIforSQL(temp); - fprintf(fout, CREATE TABLE %s_BUNFMT (\nsubject VARCHAR(10) PRIMARY KEY,\n, temp, freqCSset-items[i].csId); // TODO uppercase? underscores? + fprintf(fout, CREATE TABLE %s_BUNFMT (\nsubject VARCHAR(10) PRIMARY KEY,\n, temp, freqCSset-items[i].csId); // TODO underscores? 
for (j = 0; j labels[i].numProp; ++j) { char temp2[100]; strcpy(temp2, labels[i].lstProp[j]); @@ -411,6 +412,80 @@ void convertToSQL(CSset *freqCSset, Rela TKNZRclose(ret); } +static +void createSQLMetadata(CSset* freqCSset, CSmergeRel* csRelBetweenMergeFreqSet, Labels* labels) { + char**matrix = NULL; // matrix[from][to] + int i, j, k; + FILE*fout; + + // init + matrix = (char **) malloc(sizeof(char *) * freqCSset-numCSadded); + if (!matrix) fprintf(stderr, ERROR: Couldn't malloc memory!\n); + + for (i = 0; i freqCSset-numCSadded; ++i) { + matrix[i] = (char *) malloc(sizeof(char *) * freqCSset-numCSadded); + if (!matrix) fprintf(stderr, ERROR: Couldn't realloc memory!\n); + + for (j = 0; j freqCSset-numCSadded; ++j) { + matrix[i][j] = 0; + } + } + + // set values + for (i = 0; i freqCSset-numCSadded; ++i) { + if (freqCSset-items[i].parentFreqIdx != -1) continue; // ignore + + for (j = 0; j freqCSset-items[i].numProp; ++j) { // propNo in CS order + // check foreign key frequency + int sum = 0; + for (k = 0; k csRelBetweenMergeFreqSet[i].numRef; ++k) { + if (csRelBetweenMergeFreqSet[i].lstPropId[k] == freqCSset-items[i].lstProp[j]) { + sum += csRelBetweenMergeFreqSet[i].lstCnt[k]; + } + } + + for (k = 0; k csRelBetweenMergeFreqSet[i].numRef; ++k) { // propNo in CSrel + if (csRelBetweenMergeFreqSet[i].lstPropId[k] == freqCSset-items[i].lstProp[j]) { + int to = csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k]; + if (i == to) continue; // ignore self references + if ((int) (100.0 * csRelBetweenMergeFreqSet[i].lstCnt[k] / sum + 0.5) FK_FREQ_THRESHOLD) continue; // foreign key is not frequent enough + matrix[i][to] = 1; + } + } + } + } + + // store matrix as csv + fout = fopen(adjacencyList.csv, wt); + for (i = 0; i freqCSset-numCSadded; ++i) { + for (j = 0; j freqCSset-numCSadded; ++j) { + if (matrix[i][j]) { + fprintf(fout, \%d\,\%d\\n,i,j); + } + } + } + fclose(fout); + + // print id - table name + fout = fopen(tableIdFreq.csv, wt); + for (i = 0; i freqCSset-numCSadded; 
++i) { + char temp[100], temp2[100]; + if (freqCSset-items[i].parentFreqIdx != -1) continue; // ignore + strcpy(temp, labels[i].name); + escapeURIforSQL(temp); + sprintf(temp2, %s_BUNFMT, temp, freqCSset-items[i].csId); // TODO underscores? + fprintf(fout, \%d\,\%s\,\%d\\n, i, temp2, freqCSset-items[i].support); + } + fclose(fout); + + fout = fopen(CSmetadata.sql, wt); + fprintf(fout, CREATE TABLE table_id_freq (id VARCHAR(10), name VARCHAR(100), frequency
MonetDB: rdf - Improve memory footprint of labeling algorithm
Changeset: 2242dea64568 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=2242dea64568 Modified Files: monetdb5/extras/rdf/rdflabels.c monetdb5/extras/rdf/rdflabels.h Branch: rdf Log Message: Improve memory footprint of labeling algorithm diffs (truncated from 776 to 300 lines): diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -105,15 +105,17 @@ ontology ontologies[] = { #if USE_SHORT_NAMES /* Extracts the human-readable part of an URI (usually the last token). */ static -void getPropNameShort(char* name, char* propStr) { +void getPropNameShort(char** name, char* propStr) { char*token; - charuri[1000]; + char*uri; int length = 0; // number of tokens char**tokenizedUri = NULL; // list of tokens int i, j; int fit; // tokenize uri + uri = (char *) malloc(sizeof(char) * (strlen(propStr) + 1)); + if (!uri) fprintf(stderr, ERROR: Couldn't malloc memory!\n); strcpy(uri, propStr); // uri will be modified during tokenization token = strtok(uri, /#); while (token != NULL) { @@ -134,12 +136,20 @@ void getPropNameShort(char* name, char* } if (fit) { // found matching ontology, create label + int totalLength = 0; for (i = ontologies[j].length; i length; ++i) { - strcat(name, tokenizedUri[i]); - strcat(name, _); // if label consists of =2 tokens, use underscores + totalLength += (strlen(tokenizedUri[i]) + 1); // additional char for underscore + } + (*name) = (char *) malloc(sizeof(char) * (totalLength + 1)); + if (!(*name)) fprintf(stderr, ERROR: Couldn't malloc memory!\n); + strcpy(*name, \0); + + for (i = ontologies[j].length; i length; ++i) { + strcat(*name, tokenizedUri[i]); + strcat(*name, _); // if label consists of =2 tokens, use underscores } // remove trailing underscore - name[strlen(name) - 1] = '\0'; + (*name)[strlen(*name) - 1] = '\0'; free(tokenizedUri); return; @@ -151,12 +161,17 @@ void getPropNameShort(char* name, char* if (length == 
1) { // value - strcat(name, propStr); + (*name) = (char *) malloc(sizeof(char) * (strlen(propStr) + 1)); + if (!(*name)) fprintf(stderr, ERROR: Couldn't malloc memory!\n); + strcpy(*name, propStr); } else { - strcat(name, tokenizedUri[length - 1]); + (*name) = (char *) malloc(sizeof(char) * (strlen(tokenizedUri[length - 1]) + 1)); + if (!(*name)) fprintf(stderr, ERROR: Couldn't malloc memory!\n); + strcpy(*name, tokenizedUri[length - 1]); } free(tokenizedUri); + free(uri); return; } #endif @@ -180,8 +195,8 @@ int** initTypeAttributesHistogramCount(i } static -TypeAttributesFreq*** initTypeAttributesHistogram(int typeAttributesCount, int ** typeAttributesHistogramCount, int num) { - int i, j, k; +TypeAttributesFreq*** initTypeAttributesHistogram(int typeAttributesCount, int num) { + int i, j; TypeAttributesFreq*** typeAttributesHistogram; typeAttributesHistogram = (TypeAttributesFreq ***) malloc(sizeof(TypeAttributesFreq **) * num); @@ -190,12 +205,7 @@ TypeAttributesFreq*** initTypeAttributes typeAttributesHistogram[i] = (TypeAttributesFreq **) malloc (sizeof(TypeAttributesFreq *) * typeAttributesCount); if (!typeAttributesHistogram[i]) fprintf(stderr, ERROR: Couldn't malloc memory!\n); for (j = 0; j typeAttributesCount; ++j) { - typeAttributesHistogram[i][j] = (TypeAttributesFreq *) malloc (sizeof(TypeAttributesFreq) * typeAttributesHistogramCount[i][j]); - if (!typeAttributesHistogram[i][j]) fprintf(stderr, ERROR: Couldn't malloc memory!\n); - for (k = 0; k typeAttributesHistogramCount[i][j]; ++k) { - typeAttributesHistogram[i][j][k].freq = 0; - typeAttributesHistogram[i][j][k].percent = 0; - } +
MonetDB: rdf - SQL procedure to create a subschema
Changeset: cbde82c8ce68 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=cbde82c8ce68 Modified Files: monetdb5/extras/rdf/rdfretrieval.c monetdb5/extras/rdf/rdfretrieval.h monetdb5/extras/rdf/rdfschema.c sql/backends/monet5/sql.mx sql/scripts/30_rdf.sql Branch: rdf Log Message: SQL procedure to create a subschema diffs (truncated from 727 to 300 lines): diff --git a/monetdb5/extras/rdf/rdfretrieval.c b/monetdb5/extras/rdf/rdfretrieval.c --- a/monetdb5/extras/rdf/rdfretrieval.c +++ b/monetdb5/extras/rdf/rdfretrieval.c @@ -24,65 +24,25 @@ #include "rdflabels.h" static -char** initAdjacencyMatrix(int csCount) { - char**matrix = NULL; // matrix[from][to] - int i, j; - - matrix = (char **) malloc(sizeof(char *) * csCount); - if (!matrix) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); - - for (i = 0; i < csCount; ++i) { - matrix[i] = (char *) malloc(sizeof(char *) * csCount); - if (!matrix) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); - - for (j = 0; j < csCount; ++j) { - matrix[i][j] = 0; - } +int edgeExists(long int from, long int to, long int* adjacency_from, long int* adjacency_to, int adjacencyCount) { + int i; + for (i = 0; i < adjacencyCount; ++i) { + if (adjacency_from[i] == from && adjacency_to[i] == to) return 1; } - - return matrix; + return 0; } static -void createAdjacencyMatrix(char** matrix, CSset* freqCSset, CSmergeRel* csRelBetweenMergeFreqSet) { - int i, j, k; - - for (i = 0; i < freqCSset->numCSadded; ++i) { - if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore - - for (j = 0; j < freqCSset->items[i].numProp; ++j) { // propNo in CS order - // check foreign key frequency - int sum = 0; - for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef; ++k) { - if (csRelBetweenMergeFreqSet[i].lstPropId[k] == freqCSset->items[i].lstProp[j]) { - sum += csRelBetweenMergeFreqSet[i].lstCnt[k]; - } - } - - for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef; ++k) { // propNo in CSrel - if (csRelBetweenMergeFreqSet[i].lstPropId[k] == 
freqCSset->items[i].lstProp[j]) { - int to = csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k]; - if (i == to) continue; // ignore self references - if ((int) (100.0 * csRelBetweenMergeFreqSet[i].lstCnt[k] / sum + 0.5) < FK_FREQ_THRESHOLD) continue; // foreign key is not frequent enough - matrix[i][to] = 1; - } - } - } - } -} - -static -NodeStat* initNodeStats(CSset* freqCSset) { +NodeStat* initNodeStats1(long int* table_freq, int tableCount) { NodeStat* nodeStats = NULL; int i; - nodeStats = (NodeStat *) malloc(sizeof(NodeStat) * freqCSset->numCSadded); + nodeStats = (NodeStat *) malloc(sizeof(NodeStat) * tableCount); if (!nodeStats) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); - for (i = 0; i < freqCSset->numCSadded; ++i) { - if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore - nodeStats[i].origWeight = freqCSset->items[i].support; - nodeStats[i].weight = freqCSset->items[i].support; // weight = origWeight + for (i = 0; i < tableCount; ++i) { + nodeStats[i].origWeight = table_freq[i]; + nodeStats[i].weight = table_freq[i]; // weight = origWeight nodeStats[i].steps = -1; nodeStats[i].predecessor = -1; } @@ -91,30 +51,11 @@ NodeStat* initNodeStats(CSset* freqCSset } static -NodeStat* initNodeStats23(CSset* freqCSset) { - NodeStat* nodeStats = NULL; - int i; - - nodeStats = (NodeStat *) malloc(sizeof(NodeStat) * freqCSset->numCSadded); - if (!nodeStats) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); - - for (i = 0; i < freqCSset->numCSadded; ++i) { - if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore - nodeStats[i].origWeight = freqCSset->items[i].support; - nodeStats[i].weight = 0; - nodeStats[i].steps = -1; // not used - nodeStats[i].predecessor = 0; // not used - } - - return nodeStats; -} - -static -void bfs1(int root, CSset* freqCSset, char** adjacencyMatrix, int* queue, int* visited, int* isInQueue, int* queuePosition, int* queueLength, NodeStat* nodeStats) { +void bfs1(int root, long
MonetDB: rdf - Schema overview: first version of algorithm to choose tables that provide an overview of the SQL schema
Changeset: 9a9a115446e0 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9a9a115446e0 Modified Files: monetdb5/extras/rdf/rdfretrieval.c monetdb5/extras/rdf/rdfretrieval.h Branch: rdf Log Message: Schema overview: first version of algorithm to choose tables that provide an overview of the SQL schema diffs (289 lines): diff --git a/monetdb5/extras/rdf/rdfretrieval.c b/monetdb5/extras/rdf/rdfretrieval.c --- a/monetdb5/extras/rdf/rdfretrieval.c +++ b/monetdb5/extras/rdf/rdfretrieval.c @@ -553,8 +553,257 @@ int* retrieval4(int root, int numNodesMa return chosenNodes; } +static +char** initEdgesOverview(long int* table_id, int tableCount, long int* adjacency_from, long int* adjacency_to, int adjacencyCount) { + char**edges; + int i, j; + + edges = (char **) malloc(sizeof(char *) * tableCount); + if (!edges) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + + for (i = 0; i < tableCount; ++i) { + edges[i] = (char *) malloc(sizeof(char) * tableCount); + if (!edges[i]) fprintf(stderr, "ERROR: Couldn't malloc memory!\n"); + for (j = 0; j < tableCount; ++j) { + edges[i][j] = 0; + } + edges[i][i] = 1; // self-reachability + } + + for (i = 0; i < adjacencyCount; ++i) { + long int from = adjacency_from[i]; + long int to = adjacency_to[i]; + int fromIdx = -1; + int toIdx = -1; + + // index lookup + for (j = 0; j < tableCount; ++j) { + if (table_id[j] == from) {fromIdx = j;} + if (table_id[j] == to) {toIdx = j;} + if (fromIdx > -1 && toIdx > -1) {break;} + } + assert(fromIdx > -1); + assert(toIdx > -1); + + // set edge + edges[fromIdx][toIdx] = 1; + } + + return edges; +} + +static +int compareOverviewNodes (const void * a, const void * b) { + return ( (*(Node*)b).reachabilityCount - (*(Node*)a).reachabilityCount ); // sort descending +} + +static +int* retrievalOverview(int* numNodesActual, long int* table_id, str* table_name, long int* table_freq, int tableCount, long int* adjacency_from, long int* adjacency_to, int adjacencyCount) { + int i, j, k; + char**edges; + int 
sumSubjects = 0; + int csCount = 0; + int sumChosenSubjects = 0; + + int queue[tableCount]; // cyclic array + int isInQueue[tableCount]; + int queuePosition; // next element in queue to view at + int queueLength; + char visited[tableCount]; + int subgraphSize; + Groups groups; + int *chosenNodes = NULL; + + groups.count = 0; + groups.groups = NULL; + + edges = initEdgesOverview(table_id, tableCount, adjacency_from, adjacency_to, adjacencyCount); + + for (i = 0; i < tableCount; ++i) { + visited[i] = 0; + } + + // split into disconnected subgraph (ignoring the direction of the edges) using BFS + while (1) { + int root = -1; + for (i = 0; i < tableCount; ++i) { + if (!visited[i]) { + root = i; + break; + } + } + if (root == -1) break; // all nodes have been visited, all subgraphs have been found + // init + subgraphSize = 0; + + for (i = 0; i < tableCount; ++i) { + queue[i] = -1; + isInQueue[i] = 0; + } + + // add root node + queue[0] = root; + queuePosition = 0; + queueLength = 1; + + visited[root] = 1; + isInQueue[root] = 1; + + // bfs + while (queueLength > 0) { + // dequeue next value + int node = queue[queuePosition % tableCount]; + visited[node] = 1; + subgraphSize++; + isInQueue[node] = 0; + queuePosition += 1; + queueLength -= 1; + + // for all adjacent edges + for (i = 0; i < tableCount; ++i) { + if (visited[i] || isInQueue[i]) continue; + if (edges[node][i] || edges[i][node]) { + // ignore direction of edge + + // enqueue + queue[((queueLength + queuePosition) % tableCount)] = i; +