from:"Linnea Passing"

MonetDB: rdf - add support for not using labels in merging phase...

2014-05-07 Thread Linnea Passing

Changeset: 480cd88defb8 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=480cd88defb8
Modified Files:
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

add support for not using labels in merging phase (not enabled)


diffs (140 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -3585,6 +3585,7 @@ void generatecsRelSum(CSrel csRel, int f
 
 }
 
+#if USE_LABEL_FOR_MERGING
 static
 LabelStat* initLabelStat(void){
LabelStat *labelStat = (LabelStat*) malloc(sizeof(LabelStat)); 
@@ -3603,10 +3604,12 @@ LabelStat* initLabelStat(void){
 
return labelStat; 
 }
+#endif
 
 /*
  * 
  * */
+#if USE_LABEL_FOR_MERGING
 #if USE_ALTERNATIVE_NAME 
 static
 oid getMostSuitableName(CSlabel *labels, int freqIdx, int candIdx){
@@ -3641,6 +3644,7 @@ oid getMostSuitableName(CSlabel *labels,
 
 }
 #endif
+#endif
 
 #if DETECT_INCORRECT_TYPE_SUBJECT
 
@@ -3801,6 +3805,7 @@ void buildLabelStatForFinalMergeCS(Label
 
 #endif
 
+#if USE_LABEL_FOR_MERGING
 static
 void buildLabelStat(LabelStat *labelStat, CSlabel *labels, CSset *freqCSset, 
int k){
int i,j; 
@@ -3887,7 +3892,9 @@ void buildLabelStat(LabelStat *labelStat
}
 
 }
-
+#endif
+
+#if USE_LABEL_FOR_MERGING
 static 
 void freeLabelStat(LabelStat *labelStat){
int i; 
@@ -3901,6 +3908,7 @@ void freeLabelStat(LabelStat *labelStat)
BBPreclaim(labelStat-labelBat);
free(labelStat);
 }
+#endif
 
 static 
 void doMerge(CSset *freqCSset, int ruleNum, int freqId1, int freqId2, oid 
*mergecsId, CSlabel** labels, oid** ontmetadata, int ontmetadataCount, oid 
name, int isType, int isOntology, int isFK){
@@ -3954,6 +3962,7 @@ void doMerge(CSset *freqCSset, int ruleN
 
 }
 
+#if USE_LABEL_FOR_MERGING
 static
 str mergeMaxFreqCSByS1(CSset *freqCSset, CSlabel** labels, oid *mergecsId, 
oid** ontmetadata, int ontmetadataCount,bat *mapbatid){
int i, j; 
@@ -4188,6 +4197,7 @@ str mergeMaxFreqCSByS1(CSset *freqCSset,
 
return MAL_SUCCEED; 
 }
+#endif
 
 static
 void mergeMaxFreqCSByS5(CSrel *csrelMergeFreqSet, CSset *freqCSset, CSlabel** 
labels, oid* mergeCSFreqCSMap, int curNumMergeCS, oid *mergecsId, oid** 
ontmetadata, int ontmetadataCount){
@@ -4319,7 +4329,7 @@ void mergeMaxFreqCSByS5(CSrel *csrelMerg
 }
 
 
-
+#if USE_LABEL_FOR_MERGING
 static
 char isSemanticSimilar(int freqId1, int freqId2, CSlabel* labels, 
OntoUsageNode *tree, int numOrigFreqCS, oid *ancestor, BAT *ontmetaBat, 
OntClass *ontclassSet){  /*Rule S1 S2 S3*/
int i, j; 
@@ -4433,6 +4443,7 @@ char isSemanticSimilar(int freqId1, int 
 
return 0;
 }
+#endif
 
 static
 void initTFIDFInfos(TFIDFInfo *tfidfInfos, int curNumMergeCS, oid* 
mergeCSFreqCSMap, CSset *freqCSset, PropStat *propStat){
@@ -4476,6 +4487,7 @@ void freeTFIDFInfo(TFIDFInfo *tfidfInfos
free(tfidfInfos);
 }
 
+#if USE_LABEL_FOR_MERGING
 static
 void mergeCSByS2(CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap, 
int curNumMergeCS, oid *mergecsId,OntoUsageNode *ontoUsageTree, oid 
**ontmetadata, int ontmetadataCount, BAT *ontmetaBat, OntClass *ontclassSet){
int i, j; 
@@ -4517,6 +4529,7 @@ void mergeCSByS2(CSset *freqCSset, CSlab
}
 
 }
+#endif
 
 static
 void mergeCSByS4(CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap, 
int curNumMergeCS, oid *mergecsId,oid **ontmetadata, int ontmetadataCount){
@@ -9090,6 +9103,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
curNumMergeCS = countNumberMergeCS(freqCSset);
printf(Before using rules: Number of freqCS is: %d \n,curNumMergeCS);

+#if USE_LABEL_FOR_MERGING
/* -- S1 --- */
mergecsId = *maxCSoid + 1; 
 
@@ -9109,6 +9123,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
computeMetricsQ(freqCSset);
#endif
tmpLastT = curT;
+#endif

/* -- S3 --- */
mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS);
@@ -9159,6 +9174,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
 
tmpLastT = curT;

+#if USE_LABEL_FOR_MERGING
//S2: Common ancestor
free(mergeCSFreqCSMap);
mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS);
@@ -9179,6 +9195,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
#endif
 
tmpLastT = curT;
+#endif
 
 
//S4: TF/IDF similarity
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - implement USE_LABEL_FINDING_MAXCS (but do not ena...

2014-05-05 Thread Linnea Passing

Changeset: ab84eb43b2d9 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=ab84eb43b2d9
Modified Files:
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

implement USE_LABEL_FINDING_MAXCS (but do not enable it)


diffs (175 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -2970,12 +2970,68 @@ void updateParentIdxAll(CSset *freqCSset
}
 }
 
+#if USE_LABEL_FINDING_MAXCS
+/*
+ *  * Return 1 if there is semantic evidence against merging the two CS's, 
this is the case iff the two CS's have a hierarchy and their common ancestor is 
too generic (support above IMPORTANCE_THRESHOLD).
+ *   */
+static
+char isEvidenceAgainstMerging(int freqId1, int freqId2, CSlabel* labels, 
OntoUsageNode *tree) {
+   int i, j;
+   int level;
+   OntoUsageNode *tmpNode;
+
+   // Get common ancestor
+   int hCount1 = labels[freqId1].hierarchyCount;
+   int hCount2 = labels[freqId2].hierarchyCount;
+   int minCount = (hCount1  hCount2)?hCount2:hCount1;
+
+   if (minCount == 0) {
+   // at least one CS does not have a hierarchy -- no semantic 
information -- no semantic evidence against merging
+   return 0;
+   }
+
+   // get level where the hierarchies differ
+   for (i = 0; i  minCount; i++){
+   if (labels[freqId1].hierarchy[hCount1-1-i] != 
labels[freqId2].hierarchy[hCount2-1-i]) break;
+   }
+
+   if (i == 0) {
+   // not even the top level of the hierarchy is the same -- 
there is semantic evidence against merging the two CS's
+   return 1;
+   } else if (i == minCount) {
+   // same name -- no semantic evidence against merging
+   return 0;
+   }
+
+   // get the common ancestor at level i
+   level = 0;
+   tmpNode = tree;
+   while(level  i){
+   for (j = 0; j  tmpNode-numChildren; j++) {
+   if (tmpNode-lstChildren[j]-uri == 
labels[freqId1].hierarchy[hCount1-1-level]){
+   tmpNode = tmpNode-lstChildren[j];
+   break;
+   }
+   }
+   level++;
+   }
+
+   if (tmpNode-percentage = IMPORTANCE_THRESHOLD) {
+   // have common ancestor but it is too generic -- there is 
semantic evidence against merging the two CS's
+   return 1;
+   } else {
+   // common ancestor is specific -- no semantic evidence against 
merging
+   return 0;
+   }
+}
+#endif
+
 /*
  * Get the maximum frequent CSs from a CSset
  * Here maximum frequent CS is a CS that there exist no other CS which 
contains that CS
  * */
 static 
-void mergeCSbyS3(CSset *freqCSset, CSlabel** labels, oid *mergeCSFreqCSMap, 
int curNumMergeCS, oid **ontmetadata, int ontmetadataCount){
+void mergeCSbyS3(CSset *freqCSset, CSlabel** labels, oid *mergeCSFreqCSMap, 
int curNumMergeCS, oid **ontmetadata, int ontmetadataCount, OntoUsageNode 
*tree){
 
int numMergeCS = curNumMergeCS; 
int i, j; 
@@ -2983,13 +3039,12 @@ void mergeCSbyS3(CSset *freqCSset, CSlab
 
int tmpParentIdx; 
int freqId1, freqId2; 
-   #if USE_LABEL_FINDING_MAXCS
-   charisLabelComparable = 0;
-   #endif
-   charisDiffLabel = 0;
int numP1, numP2; 
CS  *mergecs1, *mergecs2; 
-   (void) labels;
+
+#if !USE_LABEL_FINDING_MAXCS
+   (void) tree;
+#endif
 
printf(Retrieving maximum frequent CSs: \n);
 
@@ -3000,44 +3055,35 @@ void mergeCSbyS3(CSset *freqCSset, CSlab
if (freqCSset-items[freqId1].type == DIMENSIONCS) continue; 
#endif
 
-   #if USE_LABEL_FINDING_MAXCS
-   isLabelComparable = 0;
-   if ((*labels)[i].name != BUN_NONE) isLabelComparable = 1; // no 
DUMMY
-   #endif
-
for (j = (i+1); j  numMergeCS; j++){
freqId2 = mergeCSFreqCSMap[j];
#if NOT_MERGE_DIMENSIONCS
if (freqCSset-items[freqId2].type == DIMENSIONCS) 
continue; 
#endif
 
-   isDiffLabel = 0; 
-   #if USE_LABEL_FINDING_MAXCS
-   if (isLabelComparable == 0 || 
strcmp((*labels)[freqId1].name, (*labels)[freqId2].name) != 0) {
-   isDiffLabel = 1; 
-   }
-   #endif
-
-   if (isDiffLabel == 0){
-   numP2 = freqCSset-items[freqId2].numProp;
-   numP1 = freqCSset-items[freqId1].numProp;
-   if (numP2  numP1  (numP2-numP1) 
MAX_SUB_SUPER_NUMPROP_DIF){
-   if

MonetDB: rdf - add support for printing ontology tree

2014-05-05 Thread Linnea Passing

Changeset: 121f9fd5d239 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=121f9fd5d239
Modified Files:
monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:

add support for printing ontology tree


diffs (65 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -2621,18 +2621,45 @@ void addToOntoUsageTree(OntoUsageNode* t
 }
 
 static
-void printTree(OntoUsageNode* tree, int level) {
+void printTreePrivate(OntoUsageNode* tree, int level, FILE* fout) {
int i;
-   str uriStr;
-
-   takeOid(tree-uri, uriStr);
-   printf(Level %d URI %s Count %d Sum %d Percent %.1f\n, level, uriStr, 
tree-numOccurances, tree-numOccurancesSum, tree-percentage * 100);
-
+   str uriStr, uriStrShort;
+
+   if (tree-parent) {
+   takeOid(tree-uri, uriStr);
+   getPropNameShort(uriStrShort, uriStr);
+   fprintf(fout, BUNFMT [label = \%s (%.1f%%)\];\n, tree-uri, 
uriStrShort, tree-percentage * 100);
+   fprintf(fout, BUNFMT--BUNFMT;\n, tree-uri, 
tree-parent-uri);
+   GDKfree(uriStrShort);
+   GDKfree(uriStr);
+   } else {
+   // artifical root, has no name
+   fprintf(fout, BUNFMT [label = \ROOT (%.1f%%)\];\n, 
tree-uri, tree-percentage * 100);
+   }
for (i = 0; i  tree-numChildren; ++i) {
-   printTree(tree-lstChildren[i], level+1);
+   printTreePrivate(tree-lstChildren[i], level+1, fout);
}
 }
 
+/*
+ * Print ontology tree to file, dot code
+ */
+static
+void printTree(OntoUsageNode* tree) {
+   FILE *fout = fopen(ontoUsageTree.txt, wt);
+
+   // header
+   fprintf(fout, graph g {\n);
+   fprintf(fout, graph [ratio = \compress\, rankdir = \RL\];\n);
+   fprintf(fout, node [shape = \box\];\n\n);
+   // body
+   printTreePrivate(tree, 0, fout);
+   // footer
+
+   fprintf(fout, }\n);
+   fclose(fout);
+}
+
 static
 void createOntoUsageTree(OntoUsageNode** tree, CSset* freqCSset, oid** 
ontmetadata, int ontmetadataCount, BAT *ontmetaBat,CSlabel* labels) {
int i;
@@ -2679,7 +2706,7 @@ void createOntoUsageTree(OntoUsageNode**
// print
if(0){
printf(Ontology tree:\n);
-   printTree(*tree, 0);
+   printTree(*tree);
}
 }
 
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - update layout of sampleDataFullRandom file

2014-04-29 Thread Linnea Passing

Changeset: 5f95f68cf48b for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=5f95f68cf48b
Modified Files:
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

update layout of sampleDataFullRandom file


diffs (54 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -6848,12 +6848,13 @@ str printFullSampleData(CSSampleExtend *
fouttb = fopen(filename2,wt);
foutis = fopen(filename3,wt);
 
+   fprintf(foutrand, Table|Name|Rating\n);
for (i = 0; i  num; i++){
sample = csSampleEx[i];
if ((int)sample.candidateCount == 1  sample.candidates[0] == 
BUN_NONE) continue; // do not print tables withoud candidates
freqCS = freqCSset-items[sample.freqIdx];
fprintf(fout,Table %d, %d tuples\n, i, freqCS.support);
-   fprintf(foutrand,Table %d, %d tuples\n, i, freqCS.support);
+   fprintf(foutrand,Table %d, %d tuples, i, freqCS.support);
fprintf(foutsol, Table %d\n, i);
for (j = 0; j  (int)sample.candidateCount; j++){
//fprintf(fout,BUNFMT,sample.candidates[j]);
@@ -6865,12 +6866,10 @@ str printFullSampleData(CSSampleExtend *
getStringName(sample.candidates[j], canStr, 
mapi, mbat, 1);
 #if USE_SHORT_NAMES
getPropNameShort(canStrShort, canStr);
-   if (j+1 == (int)sample.candidateCount) 
fprintf(foutrand, %s,  canStrShort);
-   else fprintf(foutrand, %s|, canStrShort);
+   fprintf(foutrand, |%s\n,  canStrShort);
GDKfree(canStrShort);
 #else
-   if (j+1 == (int)sample.candidateCount) 
fprintf(foutrand, %s,  canStr);
-   else fprintf(foutrand, %s|, canStr);
+   fprintf(foutrand, %s,  canStr);
 
 #endif
GDKfree(canStr); 
@@ -6896,11 +6895,10 @@ str printFullSampleData(CSSampleExtend *

}
}
-   fprintf(foutrand, \n);
fprintf(foutsol, \n);
 
// print origin of candidates for solutions file
-   fprintf(foutsol, New: %d, Type %d, Ontology %d, FK %d\n, 
sample.candidatesNew, sample.candidatesType, sample.candidatesOntology, 
sample.candidatesFK);
+   fprintf(foutsol, New %d, Type %d, Ontology %d, FK %d\n, 
sample.candidatesNew, sample.candidatesType, sample.candidatesOntology, 
sample.candidatesFK);

if (sample.name != BUN_NONE){
str canStrShort = NULL;
@@ -7136,7 +7134,6 @@ str printFullSampleData(CSSampleExtend *
 
fprintf(fout, \n);
fprintf(foutsol, \n);
-   fprintf(foutrand, \n);
fprintf(foutis, \  tmp.txt \n \n);
 
if (sample.name != BUN_NONE){
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - fix bug in printing referenced table names

2014-04-29 Thread Linnea Passing

Changeset: cfb55d248d82 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=cfb55d248d82
Modified Files:
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

fix bug in printing referenced table names

instead of the name of the referenced table, the name of the current table was 
printed


diffs (26 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -6582,12 +6582,11 @@ str printSampleData(CSSample *csSample, 
 #if NO_OUTPUTFILE == 0
 static
 void printPropertyWithMarkers(FILE *fout, str propStr, CSSampleExtend 
*csSampleEx, CSPropTypes *csPropTypes, int tblId, int propId, BATiter mapi, BAT 
*mbat) {
-   CSSampleExtend  sample = csSampleEx[tblId];
// print property string
fprintf(fout, %s, propStr);
 
// add star (*) if multi-valued
-   if (sample.lstIsMVCol[propId]) {
+   if (csSampleEx[tblId].lstIsMVCol[propId]) {
fprintf(fout, *);
}
 
@@ -6599,7 +6598,7 @@ void printPropertyWithMarkers(FILE *fout
 #if USE_SHORT_NAMES
str nameStrShort;
 #endif
-   getStringName(sample.candidatesOrdered[0], nameStr, 
mapi, mbat, 1);
+   
getStringName(csSampleEx[refTblId].candidatesOrdered[0], nameStr, mapi, mbat, 
1);
 #if USE_SHORT_NAMES
getPropNameShort(nameStrShort, nameStr);
fprintf(fout, -%s, nameStrShort);
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - remove language tags and quotation marks from mul...

2014-04-28 Thread Linnea Passing

Changeset: 22a192111e74 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=22a192111e74
Modified Files:
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

remove language tags and quotation marks from multi-valued properties of type 
datetime and string


diffs (34 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -5514,9 +5514,12 @@ int getObjValueFromMVBat(ValPtr returnVa
//printf(A String object value: %s \n,objStr);
tmpStr = BUNtail(tmpi, pos); 
if (strcmp(tmpStr,str_nil) != 0){
-   inputStr = GDKmalloc(sizeof(char) * 
strlen(tmpStr) + 1); 
-   memcpy(inputStr, tmpStr, sizeof(char) * 
strlen(tmpStr) + 1);
-
+   // remove quotes and language tags
+   str tmpStrShort;
+   getStringBetweenQuotes(tmpStrShort, tmpStr);
+   inputStr = GDKmalloc(sizeof(char) * 
strlen(tmpStrShort) + 1); 
+   memcpy(inputStr, tmpStrShort, sizeof(char) * 
strlen(tmpStrShort) + 1);
+   GDKfree(tmpStrShort);
VALset(returnValue, TYPE_str, inputStr);
if (rdfcast(objType, STRING, returnValue, 
castedValue) != 1){
printf(Everything should be able to 
cast to String \n);
@@ -5532,8 +5535,12 @@ int getObjValueFromMVBat(ValPtr returnVa
//printf(A Datetime object value: %s \n,objStr);
tmpStr = BUNtail(tmpi, pos);
if (strcmp(tmpStr,str_nil) != 0){
-   inputStr = GDKmalloc(sizeof(char) * 
strlen(tmpStr) + 1);
-   memcpy(inputStr, tmpStr, sizeof(char) * 
strlen(tmpStr) + 1);
+   // remove quotes and language tags
+   str tmpStrShort;
+   getStringBetweenQuotes(tmpStrShort, tmpStr);
+   inputStr = GDKmalloc(sizeof(char) * 
strlen(tmpStrShort) + 1);
+   memcpy(inputStr, tmpStrShort, sizeof(char) * 
strlen(tmpStrShort) + 1);
+   GDKfree(tmpStrShort);
VALset(returnValue, TYPE_str, inputStr);
if (rdfcast(objType, STRING, returnValue, 
castedValue) != 1){
printf(Everything should be able to 
cast to String \n);
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - if type properties are available, add them to the...

2014-04-28 Thread Linnea Passing

Changeset: f638ef061c44 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f638ef061c44
Modified Files:
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

if type properties are available, add them to the survey data in any case


diffs (44 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -6381,7 +6381,7 @@ void printPropertyWithMarkers(FILE *fout
 #if NO_OUTPUTFILE == 0
 // Compute property order and number of properties that are printed, and the 
list of remaining properties that is printed without sample data
 static
-int* createPropertyOrder(int *numPropsInSampleTable, int 
**remainingProperties, CSset *freqCSset, CSSampleExtend *csSampleEx, int tblId, 
CSPropTypes *csPropTypes, PropStat *propStat) {
+int* createPropertyOrder(int *numPropsInSampleTable, int 
**remainingProperties, CSset *freqCSset, CSSampleExtend *csSampleEx, int tblId, 
CSPropTypes *csPropTypes, PropStat *propStat, char* isTypeProp) {
int i;
CSSampleExtend  sample;
CSPropTypes csPropType;
@@ -6466,11 +6466,21 @@ int* createPropertyOrder(int *numPropsIn
}
 
// now add properties to propOrder array
+   // add all type properties
+   for (i = 0; i  sample.numProp; ++i) {
+   if (propsAdded = (*numPropsInSampleTable)) break; // enough 
properties found
+   if (isTypeProp[i]) { // do not use 'index' because the 
isTypeProp array uses the old order of properties
+   propOrder[propsAdded] = i;
+   isAdded[i] = 1;
+   propsAdded++;
+   }
+   }
+
// first round: properties with isFilled=1 and isTextDate=1, ordered by 
tfidfValues descending
for (i = 0; i  sample.numProp; ++i) {
int index = propOrderTfidf[i];
if (propsAdded = (*numPropsInSampleTable)) break; // enough 
properties found
-   if (isFilled[index]  isTextDate[index]) {
+   if (isFilled[index]  isTextDate[index]  !isAdded[index]) {
// add
propOrder[propsAdded] = index;
isAdded[index] = 1;
@@ -6702,7 +6712,7 @@ str printFullSampleData(CSSampleExtend *
// order properties and get list of remaining properties that 
will be printed without sample data
remainingProperties = NULL;
numPropsInSampleTable = 0;
-   propOrder = createPropertyOrder(numPropsInSampleTable, 
remainingProperties, freqCSset, csSampleEx, i, csPropTypes, propStat);
+   propOrder = createPropertyOrder(numPropsInSampleTable, 
remainingProperties, freqCSset, csSampleEx, i, csPropTypes, propStat, 
isTypeProp);
 
// print list of columns that did not make it to propOrder and 
are therefore printed without sample data
if (sample.numProp  numPropsInSampleTable) {
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - ensure same height of tables, necessary for layou...

2014-04-28 Thread Linnea Passing

Changeset: acc2cfbc3ba4 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=acc2cfbc3ba4
Modified Files:
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

ensure same height of tables, necessary for layouting


diffs (13 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -6736,6 +6736,9 @@ str printFullSampleData(CSSampleExtend *
GDKfree(propStr);
}
fprintf(fout, \n);
+   } else {
+   // we have to print an empty row to ensure that all 
tables have the same height, this simplifies the survey layouting in a 
spreadsheet programm
+   fprintf(fout, \n);
}
 
//List of columns
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - fix scope

2014-04-25 Thread Linnea Passing

Changeset: ad60eed4d99a for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=ad60eed4d99a
Modified Files:
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

fix scope


diffs (19 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -9192,6 +9192,7 @@ RDFreorganize(int *ret, CStableStat *cst
printf(Return value from RDFdistTriplesToCSs is %s \n, returnStr);
if (returnStr != MAL_SUCCEED){
throw(RDF, rdf.RDFreorganize, Problem in distributing 
triples to BATs using CSs);   
+   }

curT = clock(); 
printf (RDFdistTriplesToCSs process took  %f seconds.\n, 
((float)(curT - tmpLastT))/CLOCKS_PER_SEC);
@@ -9200,7 +9201,6 @@ RDFreorganize(int *ret, CStableStat *cst
#if NO_OUTPUTFILE == 0
printFKMultiplicityFromCSPropTypes(csPropTypes, numTables, freqCSset, 
*freqThreshold);
#endif
-   }

#if NO_OUTPUTFILE == 0
{
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - set name origin (isType, isOntology, isFK) when u...

2014-04-25 Thread Linnea Passing

Changeset: d201cd7814d2 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=d201cd7814d2
Modified Files:
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdflabels.h
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

set name origin (isType, isOntology, isFK) when updating labels


diffs (285 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -2928,7 +2928,7 @@ oid* mergeCandidates(int *candidatesCoun
  * If no MERGECS is created (subset-superset relation), mergeCSFreqId contains 
the Id of the superset class.
  * For S1 and S2, parameter 'name' is used to avoid recomputation of CS names
  */
-str updateLabel(int ruleNumber, CSset *freqCSset, CSlabel **labels, int newCS, 
int mergeCSFreqId, int freqCS1, int freqCS2, oid name, oid **ontmetadata, int 
ontmetadataCount, int *lstFreqId, int numIds){
+str updateLabel(int ruleNumber, CSset *freqCSset, CSlabel **labels, int newCS, 
int mergeCSFreqId, int freqCS1, int freqCS2, oid name, int isType, int 
isOntology, int isFK, oid **ontmetadata, int ontmetadataCount, int *lstFreqId, 
int numIds){
int i;
int freqCS1Counter;
CSlabel big, small;
@@ -2945,6 +2945,12 @@ str updateLabel(int ruleNumber, CSset *f
(void) lstFreqId;
(void) numIds;
 
+   #if ! INFO_WHERE_NAME_FROM
+   (void) isType;
+   (void) isOntology;
+   (void) isFK;
+   #endif
+
if (newCS) {
// realloc labels
*labels = GDKrealloc(*labels, sizeof(CSlabel) * 
freqCSset-numCSadded);
@@ -2979,6 +2985,11 @@ str updateLabel(int ruleNumber, CSset *f
case S1: // was: (S1 or S2), now combined
// use common name
label-name = name;
+   #if INFO_WHERE_NAME_FROM
+   label-isType = isType;
+   label-isOntology = isOntology;
+   label-isFK = isFK;
+   #endif
 
#if USE_MULTIWAY_MERGING
(void)ontmetadata;
@@ -2996,9 +3007,6 @@ str updateLabel(int ruleNumber, CSset *f
label-candidatesOntology = candidatesOntology;
label-candidatesFK = candidatesFK;
removeDuplicatedCandidates(label);
-   if (label-name == BUN_NONE  label-candidates[0] != 
BUN_NONE) {
-   label-name = label-candidates[0];
-   }
 
// hierarchy
if ((*labels)[freqCS1].name == label-name) {
@@ -3033,6 +3041,11 @@ str updateLabel(int ruleNumber, CSset *f
case S2:
// use common ancestor
label-name = name;
+   #if INFO_WHERE_NAME_FROM
+   label-isType = isType;
+   label-isOntology = isOntology;
+   label-isFK = isFK;
+   #endif
 
// candidates
mergedCandidates = mergeCandidates(candidatesCount, 
candidatesNew, candidatesType, candidatesOntology, candidatesFK, 
(*labels)[freqCS1], (*labels)[freqCS2], label-name);
@@ -3044,9 +3057,6 @@ str updateLabel(int ruleNumber, CSset *f
label-candidatesOntology = candidatesOntology;
label-candidatesFK = candidatesFK;
removeDuplicatedCandidates(label);
-   if (label-name == BUN_NONE  label-candidates[0] != 
BUN_NONE) {
-   label-name = label-candidates[0];
-   }
 
// hierarchy
freqCS1Counter = (*labels)[freqCS1].hierarchyCount - 1;
@@ -3080,8 +3090,14 @@ str updateLabel(int ruleNumber, CSset *f
label-candidatesFK = candidatesFK;
removeDuplicatedCandidates(label);
if (label-name == BUN_NONE  label-candidates[0] != 
BUN_NONE) {
+   // superCS had no name before, but subCS adds candidates
label-name = label-candidates[0];
-   }
+   #if INFO_WHERE_NAME_FROM
+   label-isType = (*labels)[freqCS2].isType;
+   label-isOntology = (*labels)[freqCS2].isOntology;
+   label-isFK = (*labels)[freqCS2].isFK;
+   #endif
+   } // else: old name and isType/isOntology/isFK remain valid
 
// hierarchy already set
// properties already set
@@ -3113,6 +3129,11 @@ str updateLabel(int ruleNumber, CSset *f
}
 // #endif
label-name = big.name;
+   #if INFO_WHERE_NAME_FROM
+   label-isType = big.isType;
+   label-isOntology = big.isOntology;
+   label-isFK = big.isFK;
+   #endif
 
// candidates
mergedCandidates =

MonetDB: rdf - improve layout and data presentation of survey data

2014-04-25 Thread Linnea Passing

Changeset: 2f740b0aabd2 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=2f740b0aabd2
Modified Files:
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

improve layout and data presentation of survey data
- remove quotes and language tags from strings
- indicate multi-valued properties with a star *
- indicate FK properties with a reference ->ReferencedTableName
- use only last part of URI for type property values


diffs (165 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -6190,8 +6190,43 @@ str printSampleData(CSSample *csSample, 
 #endif
 
 #if NO_OUTPUTFILE == 0
+static
+void printPropertyWithMarkers(FILE *fout, str propStr, CSSampleExtend 
*csSampleEx, CSPropTypes *csPropTypes, int tblId, int propId, BATiter mapi, BAT 
*mbat) {
+   // print property string
+   fprintf(fout, %s, propStr);
+
+   // add star (*) if multi-valued
+   if (csSampleEx[tblId].lstIsMVCol[propId]) {
+   fprintf(fout, *);
+   }
+
+   // add reference (-) if FK
+   if (csPropTypes[tblId].lstPropTypes[propId].isFKProp == 1) {
+   str nameStr;
+   int refTblId = csPropTypes[tblId].lstPropTypes[propId].refTblId;
+   if (csSampleEx[refTblId].candidatesOrdered[0] != BUN_NONE) { // 
table name (= best candidate) available
+#if USE_SHORT_NAMES
+   str nameStrShort;
+#endif
+   getStringName(csSampleEx[tblId].candidatesOrdered[0], 
nameStr, mapi, mbat, 1);
+#if USE_SHORT_NAMES
+   getPropNameShort(nameStrShort, nameStr);
+   fprintf(fout, -%s, nameStrShort);
+   GDKfree(nameStrShort);
+#else
+   fprintf(fout, -%s, nameStr);
+#endif
+   GDKfree(nameStr);
+   } else { // no table name
+   fprintf(fout, -Table%d, refTblId);
+   }
+   }
+}
+#endif
+
+#if NO_OUTPUTFILE == 0
 static 
-str printFullSampleData(CSSampleExtend *csSampleEx, int num, BAT *mbat, 
PropStat *propStat, CSset *freqCSset){
+str printFullSampleData(CSSampleExtend *csSampleEx, int num, BAT *mbat, 
PropStat *propStat, CSset *freqCSset, CSPropTypes *csPropTypes){
 
int i,j, k; 
FILE*fout, *foutrand, *foutsol, *fouttb, *foutis; 
@@ -6230,13 +6265,24 @@ str printFullSampleData(CSSampleExtend *
int found = 0;
CS  freqCS;
 
-
-   mapi = bat_iterator(mbat);
+   oid *typeAttributesOids;
+   char*isTypeProp; // 1 if property is in typeAttributes[]
+
if (TKNZRopen (NULL, schema) != MAL_SUCCEED) {
throw(RDF, rdf.rdfschema,
could not open the tokenizer\n);
}

+   // get oids for typeAttributes[]
+   typeAttributesOids = GDKmalloc(sizeof(oid) * typeAttributesCount);
+   if (!typeAttributesOids){
+   fprintf(stderr, ERROR: Couldn't malloc memory!\n);
+   }
+   for (i = 0; i  typeAttributesCount; ++i) {
+   TKNZRappend(typeAttributesOids[i], typeAttributes[i]);
+   }
+
+   mapi = bat_iterator(mbat);
 
strcpy(filename, sampleDataFull);
strcat(filename, .txt);
@@ -6336,6 +6382,24 @@ str printFullSampleData(CSSampleExtend *
else 
fprintf(fouttb,CREATE TABLE tbSample%d \n (\n, i);
 
+   // mark type columns, because their sample data is represented 
without ...
+   isTypeProp = GDKmalloc(sizeof(char) * sample.numProp);
+   if (!isTypeProp){
+   fprintf(stderr, ERROR: Couldn't malloc memory!\n);
+   }
+   for (j = 0; j  sample.numProp; ++j) {
+   isTypeProp[j] = 0;
+   }
+   for (j = 0; j  sample.numProp; ++j) {
+   for (k = 0; k  typeAttributesCount; ++k) {
+   if (sample.lstProp[j] == typeAttributesOids[k]) 
{
+   // found a type property
+   isTypeProp[j] = 1;
+   break;
+   }
+   }
+   }
+
// Compute property order (descending by support) and number of 
properties that are printed
found = 0;
numPropsInSampleTable = 
(sample.numProp(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE))?(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE):sample.numProp;
@@ -6424,7 +6488,8 @@ str printFullSampleData(CSSampleExtend *
takeOid(sample.lstProp[index], propStr);   
 #if USE_SHORT_NAMES
getPropNameShort(propStrShort, propStr);
-   fprintf(fout,|%s, propStrShort);
+

MonetDB: rdf - change selection of properties shown to the user

2014-04-25 Thread Linnea Passing

Changeset: 38f4907254da for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=38f4907254da
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

change selection of properties shown to the user
- choose discriminating, filled, text/datetime properties first; then 
discriminating, filled properties; then discriminating properties
- add all omitted properties to the output file without showing sample data
- remove reordering of props from printSampleData() because it is not used for 
generating survey data (instead, printFullSampleData() is used)


diffs (truncated from 527 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -5815,7 +5815,7 @@ void getTblName(str *name, oid nameId, B
 
 #if NO_OUTPUTFILE == 0 
 static 
-str printSampleData(CSSample *csSample, CSset *freqCSset, BAT *mbat, int num, 
int sampleVersion, PropStat *propStat){
+str printSampleData(CSSample *csSample, CSset *freqCSset, BAT *mbat, int num, 
int sampleVersion){
 
int i,j, k; 
FILE*fout, *fouttb, *foutis; 
@@ -5828,9 +5828,6 @@ str printSampleData(CSSample *csSample, 
char*   schema = rdf;
CSSamplesample; 
CS  freqCS; 
-   int*propOrder;
-   int*propOrderTfidf;
-   float*  tfidfValues;
int numPropsInSampleTable;
charobjType = 0; 
str objStr; 
@@ -5935,82 +5932,6 @@ str printSampleData(CSSample *csSample, 
//Number of tuples
fprintf(fout, %d\n, freqCS.support);
 
-   // Compute property order (descending by support) and number of 
properties that are printed
-   if (sampleVersion  1) {
-   int found = 0;
-   numPropsInSampleTable = 
(sample.numProp(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE))?(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE):sample.numProp;
-   propOrder = GDKmalloc(sizeof(int) * sample.numProp);
-   propOrderTfidf = GDKmalloc(sizeof(int) * 
sample.numProp);
-   tfidfValues = GDKmalloc(sizeof(float) * sample.numProp);
-   for (j = 0; j  sample.numProp; ++j) {
-   propOrder[j] = j;
-   propOrderTfidf[j] = j;
-   }
-
-   // To get the top NUM_PROP_SUPPORT_SAMPLE properties, 
sort all properties descending by support.
-   // The subject column remains at the first position 
regardless of its support.
-   // Sort using insertion sort.
-   for (j = 2; j  sample.numProp; ++j) {
-   int tmpPos = propOrder[j];
-   int tmpVal = freqCS.lstPropSupport[tmpPos];
-   int k = j - 1;
-   while (k = 1  
freqCS.lstPropSupport[propOrder[k]]  tmpVal) { // sort descending
-   propOrder[k + 1] = propOrder[k];
-   k--;
-   }
-   propOrder[k + 1] = tmpPos;
-   }
-
-   // To get the top NUM_PROP_TFIDF_SAMPLE properties, 
sort all properties descending by tf-idf score.
-   for (j = 1; j  sample.numProp; ++j) {
-   float tfidf;
-   BUN bun = 
BUNfnd(BATmirror(propStat-pBat),(ptr) sample.lstProp[j]);
-   if (bun == BUN_NONE) {
-   printf(Error: property not found\n);
-   } else {
-   tfidf = propStat-tfidfs[bun];
-   }
-   tfidfValues[j] = tfidf;
-   }
-
-   // Sort using insertion sort. Ignore subject column
-   for (j = 2; j  sample.numProp; ++j) {
-   int tmpPos = propOrderTfidf[j];
-   float tmpVal = tfidfValues[tmpPos];
-   int k = j - 1;
-   while (k = 1  tfidfValues[propOrderTfidf[k]] 
 tmpVal) { // sort descending
-   propOrderTfidf[k + 1] = 
propOrderTfidf[k];
-   k--;
-   }
-   propOrderTfidf[k + 1] = tmpPos;
-   }
-
-   // Add NUM_PROP_TFIDF_SAMPLE properties to propOrder 
that have a high tfidf score but are not yet in the top 1+NUM_PROP_TFIDF_SAMPLE 
values of propOrder
-   for (j = 1; j

MonetDB: rdf - change FullSampleData to print only 8 columns and...

2014-04-17 Thread Linnea Passing

Changeset: 782ccaa7dff9 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=782ccaa7dff9
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

change FullSampleData to print only 8 columns and add a file that contains the 
solutions (ordered candidates)


diffs (truncated from 369 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -5507,9 +5507,15 @@ str initFullSampleData(CSSampleExtend *c
csSampleEx[i].name = cstablestat-lstcstable[i].tblname; 
csSampleEx[i].candidateCount = tmpNumcand;
csSampleEx[i].candidates = (oid*)malloc(sizeof(oid) * 
tmpNumcand); 
+   csSampleEx[i].candidatesOrdered = (oid*)malloc(sizeof(oid) * 
tmpNumcand); 
for (k = 0; k  tmpNumcand; k++){
csSampleEx[i].candidates[k] = 
label[freqId].candidates[k]; 
-   }
+   csSampleEx[i].candidatesOrdered[k] = 
label[freqId].candidates[k]; 
+   }
+   csSampleEx[i].candidatesNew = label[freqId].candidatesNew;
+   csSampleEx[i].candidatesOntology = 
label[freqId].candidatesOntology;
+   csSampleEx[i].candidatesType = label[freqId].candidatesType;
+   csSampleEx[i].candidatesFK = label[freqId].candidatesFK;
//Randomly exchange the value, change the position k with a 
random pos
for (k = 0; k  tmpNumcand; k++){
randValue = rand() % tmpNumcand;
@@ -5650,6 +5656,7 @@ void freeSampleExData(CSSampleExtend *cs
free(csSampleEx[i].lstIsInfrequentProp);
free(csSampleEx[i].lstIsMVCol);
free(csSampleEx[i].candidates); 
+   free(csSampleEx[i].candidatesOrdered); 
free(csSampleEx[i].lstSubjOid);
for (j = 0; j  csSampleEx[i].numProp; j++){
BBPunfix(csSampleEx[i].colBats[j]-batCacheid);
@@ -6184,11 +6191,11 @@ str printSampleData(CSSample *csSample, 
 
 #if NO_OUTPUTFILE == 0
 static 
-str printFullSampleData(CSSampleExtend *csSampleEx, int num, BAT *mbat){
+str printFullSampleData(CSSampleExtend *csSampleEx, int num, BAT *mbat, 
PropStat *propStat, CSset *freqCSset){
 
int i,j, k; 
-   FILE*fout, *fouttb, *foutis; 
-   charfilename[100], filename2[100], filename3[100];
+   FILE*fout, *foutsol, *fouttb, *foutis; 
+   charfilename[100], filename4[100], filename2[100], filename3[100];
int ret;
 
str propStr; 
@@ -6216,6 +6223,12 @@ str printFullSampleData(CSSampleExtend *
str propStrShort = NULL;
char*pch; 
 #endif
+   int*propOrder;
+   int*propOrderTfidf;
+   float*  tfidfValues;
+   int numPropsInSampleTable;
+   int found = 0;
+   CS  freqCS;
 
 
mapi = bat_iterator(mbat);
@@ -6227,6 +6240,9 @@ str printFullSampleData(CSSampleExtend *
 
strcpy(filename, sampleDataFull);
strcat(filename, .txt);
+
+   strcpy(filename4, sampleDataFullSolution);
+   strcat(filename4, .txt);

strcpy(filename2, createSampleTableFull);
strcat(filename2, .sh);
@@ -6235,12 +6251,15 @@ str printFullSampleData(CSSampleExtend *
strcat(filename3, .sh);

fout = fopen(filename,wt); 
+   foutsol = fopen(filename4,wt);
fouttb = fopen(filename2,wt);
foutis = fopen(filename3,wt);
 
for (i = 0; i  num; i++){
sample = csSampleEx[i];
-   fprintf(fout,Sample table %d Candidates: , i);
+   freqCS = freqCSset-items[sample.freqIdx];
+   fprintf(fout,Table %d\n, i);
+   fprintf(foutsol, Table %d\n, i);
for (j = 0; j  (int)sample.candidateCount; j++){
//fprintf(fout,BUNFMT,sample.candidates[j]);
if (sample.candidates[j] != BUN_NONE){
@@ -6251,18 +6270,43 @@ str printFullSampleData(CSSampleExtend *
getStringName(sample.candidates[j], canStr, 
mapi, mbat, 1);
 #if USE_SHORT_NAMES
getPropNameShort(canStrShort, canStr);
-   fprintf(fout,;%s,  canStrShort);
+   if (j+1 == (int)sample.candidateCount) 
fprintf(fout, %s,  canStrShort);
+   else fprintf(fout, %s;, canStrShort);
GDKfree(canStrShort);
 #else
-   fprintf(fout,;%s,  canStr);
+   if (j+1 == (int)sample.candidateCount) 
fprintf(fout, %s,  canStr);
+   else fprintf(fout, %s;, canStr);
+
 #endif
GDKfree(canStr);

MonetDB: rdf - change delimiter in sample data, split into two f...

2014-04-17 Thread Linnea Passing

Changeset: df9f9c031311 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=df9f9c031311
Modified Files:
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

change delimiter in sample data, split into two files (instances + candidates)


diffs (162 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -6196,7 +6196,7 @@ static
 str printFullSampleData(CSSampleExtend *csSampleEx, int num, BAT *mbat, 
PropStat *propStat, CSset *freqCSset){
 
int i,j, k; 
-   FILE*fout, *foutsol, *fouttb, *foutis; 
+   FILE*fout, *foutrand, *foutsol, *fouttb, *foutis; 
charfilename[100], filename4[100], filename2[100], filename3[100];
int ret;
 
@@ -6254,13 +6254,15 @@ str printFullSampleData(CSSampleExtend *

fout = fopen(filename,wt); 
foutsol = fopen(filename4,wt);
+   foutrand = fopen(sampleDataFullRandom.txt,wt);
fouttb = fopen(filename2,wt);
foutis = fopen(filename3,wt);
 
for (i = 0; i  num; i++){
sample = csSampleEx[i];
freqCS = freqCSset-items[sample.freqIdx];
-   fprintf(fout,Table %d\n, i);
+   fprintf(fout,Table %d, %d tuples\n, i, freqCS.support);
+   fprintf(foutrand,Table %d, %d tuples\n, i, freqCS.support);
fprintf(foutsol, Table %d\n, i);
for (j = 0; j  (int)sample.candidateCount; j++){
//fprintf(fout,BUNFMT,sample.candidates[j]);
@@ -6272,12 +6274,12 @@ str printFullSampleData(CSSampleExtend *
getStringName(sample.candidates[j], canStr, 
mapi, mbat, 1);
 #if USE_SHORT_NAMES
getPropNameShort(canStrShort, canStr);
-   if (j+1 == (int)sample.candidateCount) 
fprintf(fout, %s,  canStrShort);
-   else fprintf(fout, %s;, canStrShort);
+   if (j+1 == (int)sample.candidateCount) 
fprintf(foutrand, %s,  canStrShort);
+   else fprintf(foutrand, %s|, canStrShort);
GDKfree(canStrShort);
 #else
-   if (j+1 == (int)sample.candidateCount) 
fprintf(fout, %s,  canStr);
-   else fprintf(fout, %s;, canStr);
+   if (j+1 == (int)sample.candidateCount) 
fprintf(foutrand, %s,  canStr);
+   else fprintf(foutrand, %s|, canStr);
 
 #endif
GDKfree(canStr); 
@@ -6292,18 +6294,18 @@ str printFullSampleData(CSSampleExtend *
 #if USE_SHORT_NAMES
getPropNameShort(canStrShort, canStr);
if (j+1 == (int)sample.candidateCount) 
fprintf(foutsol, %s (%s),  canStrShort, canStr);
-   else fprintf(foutsol, %s (%s);, canStrShort, 
canStr);
+   else fprintf(foutsol, %s (%s)|, canStrShort, 
canStr);
GDKfree(canStrShort);
 #else
if (j+1 == (int)sample.candidateCount) 
fprintf(foutsol, %s,  canStr);
-   else fprintf(foutsol, %s;, canStr);
+   else fprintf(foutsol, %s|, canStr);
 
 #endif
GDKfree(canStr); 

}
}
-   fprintf(fout, \n);
+   fprintf(foutrand, \n);
fprintf(foutsol, \n);
 
// print origin of candidates for solutions file
@@ -6335,9 +6337,6 @@ str printFullSampleData(CSSampleExtend *
else 
fprintf(fouttb,CREATE TABLE tbSample%d \n (\n, i);
 
-   //Number of tuples
-   fprintf(fout, %d\n, freqCS.support);
-
// Compute property order (descending by support) and number of 
properties that are printed
found = 0;
numPropsInSampleTable = 
(sample.numProp(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE))?(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE):sample.numProp;
@@ -6426,7 +6425,7 @@ str printFullSampleData(CSSampleExtend *
takeOid(sample.lstProp[index], propStr);   
 #if USE_SHORT_NAMES
getPropNameShort(propStrShort, propStr);
-   fprintf(fout,;%s, propStrShort);
+   fprintf(fout,|%s, propStrShort);
 
pch = strstr (propStrShort,-);
if (pch != NULL) *pch = '\0';   //Remove - characters 
from prop  //WEBCRAWL specific problem
@@ -6507,14 +6506,14 @@ str printFullSampleData(CSSampleExtend *
}
else{

MonetDB: rdf - fix delimiter in sample data, fix number of candi...

2014-04-17 Thread Linnea Passing

Changeset: f69ebb7d9f55 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f69ebb7d9f55
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

fix delimiter in sample data, fix number of candidates that are printed


diffs (24 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -6501,7 +6501,7 @@ str printFullSampleData(CSSampleExtend *
if (tmpBat-ttype == TYPE_oid){ //URI or BLANK 
NODE  or MVCol
objOid = (oid *) BUNtail(tmpi, k);
if (*objOid == oid_nil){
-   fprintf(fout,;NULL);
+   fprintf(fout,|NULL);
fprintf(foutis,|NULL);
}
else{
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -403,7 +403,7 @@ typedef struct CSPropTypes {
 
 #define NUM_SAMPLETABLE 20
 #define NUM_SAMPLE_INSTANCE 10
-#define NUM_SAMPLE_CANDIDATE 999 // print all candidates
+#define NUM_SAMPLE_CANDIDATE 9
 #define SAMPLE_FILTER_THRESHOLD 10  // SAMPLE_FILTER_THRESHOLD/ 100
 #define GETSAMPLE_BEFOREMERGING 1  // Get the sample data before merging CS's
 #define NUM_PROP_SUPPORT_SAMPLE 5 // how many properties should be added to 
the sample data because of a high support (excluding subject column)
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - do not add tables without candidates to sample data

2014-04-17 Thread Linnea Passing

Changeset: 162b64fd0507 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=162b64fd0507
Modified Files:
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

do not add tables without candidates to sample data


diffs (11 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -6260,6 +6260,7 @@ str printFullSampleData(CSSampleExtend *
 
for (i = 0; i  num; i++){
sample = csSampleEx[i];
+   if ((int)sample.candidateCount == 1 && sample.candidates[0] == 
BUN_NONE) continue; // do not print tables withoud candidates
freqCS = freqCSset-items[sample.freqIdx];
fprintf(fout,Table %d, %d tuples\n, i, freqCS.support);
fprintf(foutrand,Table %d, %d tuples\n, i, freqCS.support);
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - use full URIs in sample data

2014-04-17 Thread Linnea Passing

Changeset: 1ac4a7056475 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=1ac4a7056475
Modified Files:
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

use full URIs in sample data


diffs (41 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -6024,7 +6024,7 @@ str printSampleData(CSSample *csSample, 
isSite = 0; 
for (j = 0; j  numPropsInSampleTable; j++){
int index = j;
-   if (sampleVersion  1){ //Do not consider 
infreq Prop 
+   if (sampleVersion  1){
index = propOrder[index]; // apply mapping to 
change order of properties
}
 #if USE_SHORT_NAMES
@@ -6103,7 +6103,7 @@ str printSampleData(CSSample *csSample, 

for (j = 0; j  numPropsInSampleTable; j++){
int index = j;
-   if (sampleVersion  1){ //Do not 
consider infreq Prop 
+   if (sampleVersion  1){
index = propOrder[index]; // apply 
mapping to change order of properties
}
objOid = sample.lstObj[index][k];
@@ -6480,18 +6480,9 @@ str printFullSampleData(CSSampleExtend *
fprintf(foutis, echo \);
//All the instances 
for (k = 0; k  sample.numInstances; k++){
-#if USE_SHORT_NAMES
-   str subjStrShort = NULL;
-#endif
takeOid(sample.lstSubjOid[k], subjStr); 
-#if USE_SHORT_NAMES
-   getPropNameShort(subjStrShort, subjStr);
-   fprintf(fout,%s, subjStrShort);
-   fprintf(foutis,%s, subjStrShort);
-   GDKfree(subjStrShort);
-#else
+   fprintf(foutis,%s, subjStr);
fprintf(fout,%s, subjStr);
-#endif
GDKfree(subjStr); 

for (j = 0; j  numPropsInSampleTable; j++){
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - add properties with high tfidf scores to sample data

2014-04-16 Thread Linnea Passing

Changeset: b443cd8459e9 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=b443cd8459e9
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

add properties with high tfidf scores to sample data


diffs (170 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -5763,7 +5763,7 @@ void getTblName(str *name, oid nameId, B
 
 #if NO_OUTPUTFILE == 0 
 static 
-str printSampleData(CSSample *csSample, CSset *freqCSset, BAT *mbat, int num, 
int sampleVersion){
+str printSampleData(CSSample *csSample, CSset *freqCSset, BAT *mbat, int num, 
int sampleVersion, PropStat *propStat){
 
int i,j, k; 
FILE*fout, *fouttb, *foutis; 
@@ -5777,6 +5777,8 @@ str printSampleData(CSSample *csSample, 
CSSamplesample; 
CS  freqCS; 
int*propOrder;
+   int*propOrderTfidf;
+   float*  tfidfValues;
int numPropsInSampleTable;
charobjType = 0; 
str objStr; 
@@ -5883,14 +5885,19 @@ str printSampleData(CSSample *csSample, 
 
// Compute property order (descending by support) and number of 
properties that are printed
if (sampleVersion  1) {
-   numPropsInSampleTable = 
(sample.numPropNUM_PROPS_IN_SAMPLE_DATA)?NUM_PROPS_IN_SAMPLE_DATA:sample.numProp;
+   int found = 0;
+   numPropsInSampleTable = 
(sample.numProp(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE))?(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE):sample.numProp;
propOrder = GDKmalloc(sizeof(int) * sample.numProp);
+   propOrderTfidf = GDKmalloc(sizeof(int) * 
sample.numProp);
+   tfidfValues = GDKmalloc(sizeof(float) * sample.numProp);
for (j = 0; j  sample.numProp; ++j) {
propOrder[j] = j;
-   }
-
-   // insertion sort
-   // do not sort Subject (first property), it should 
remain at the first position
+   propOrderTfidf[j] = j;
+   }
+
+   // To get the top NUM_PROP_SUPPORT_SAMPLE properties, 
sort all properties descending by support.
+   // The subject column remains at the first position 
regardless of its support.
+   // Sort using insertion sort.
for (j = 2; j  sample.numProp; ++j) {
int tmpPos = propOrder[j];
int tmpVal = freqCS.lstPropSupport[tmpPos];
@@ -5902,6 +5909,51 @@ str printSampleData(CSSample *csSample, 
propOrder[k + 1] = tmpPos;
}
 
+   // To get the top NUM_PROP_TFIDF_SAMPLE properties, 
sort all properties descending by tf-idf score.
+   for (j = 1; j  sample.numProp; ++j) {
+   float tfidf;
+   BUN bun = 
BUNfnd(BATmirror(propStat-pBat),(ptr) sample.lstProp[j]);
+   if (bun == BUN_NONE) {
+   printf(Error: property not found\n);
+   } else {
+   tfidf = propStat-tfidfs[bun];
+   }
+   tfidfValues[j] = tfidf;
+   }
+
+   // Sort using insertion sort. Ignore subject column
+   for (j = 2; j  sample.numProp; ++j) {
+   int tmpPos = propOrderTfidf[j];
+   float tmpVal = tfidfValues[tmpPos];
+   int k = j - 1;
+   while (k = 1  tfidfValues[propOrderTfidf[k]] 
 tmpVal) { // sort descending
+   propOrderTfidf[k + 1] = 
propOrderTfidf[k];
+   k--;
+   }
+   propOrderTfidf[k + 1] = tmpPos;
+   }
+
+   // Add NUM_PROP_TFIDF_SAMPLE properties to propOrder 
that have a high tfidf score but are not yet in the top 1+NUM_PROP_TFIDF_SAMPLE 
values of propOrder
+   for (j = 1; j  sample.numProp; ++j) {
+   int prop, foundProp, bound;
+   if (found == NUM_PROP_TFIDF_SAMPLE) break;
+   prop = propOrderTfidf[j];
+   // check if prop is already choosen
+   foundProp = 0;
+   bound =

MonetDB: rdf - fix number of properties printed as sample data

2014-04-16 Thread Linnea Passing

Changeset: ff5c66817286 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=ff5c66817286
Modified Files:
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

fix number of properties printed as sample data


diffs (12 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -5995,7 +5995,7 @@ str printSampleData(CSSample *csSample, 
 #endif
GDKfree(subjStr); 

-   for (j = 0; j  sample.numProp; j++){
+   for (j = 0; j  numPropsInSampleTable; j++){
int index = j;
if (sampleVersion  1){ //Do not 
consider infreq Prop 
index = propOrder[index]; // apply 
mapping to change order of properties
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - change sampleData generation to generate only 8 c...

2014-04-15 Thread Linnea Passing

Changeset: c78661b8a206 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c78661b8a206
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

change sampleData generation to generate only 8 columns (ordered by support)


diffs (169 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -761,14 +761,6 @@ char isInfrequentProp(PropTypes pt, CS c
 
 #if NO_OUTPUTFILE == 0 
 static
-char isInfrequentSampleProp(CS freqCS, int propIdx){
-   if (freqCS.lstPropSupport[propIdx] * 100  freqCS.support * 
SAMPLE_FILTER_THRESHOLD) return 1; 
-   else return 0;
-}
-#endif
-
-#if NO_OUTPUTFILE == 0 
-static
 char isInfrequentSampleCol(CS freqCS, PropTypes pt){
if (pt.propFreq * 100   freqCS.support * SAMPLE_FILTER_THRESHOLD) 
return 1;
else return 0; 
@@ -5784,6 +5776,8 @@ str printSampleData(CSSample *csSample, 
char*   schema = rdf;
CSSamplesample; 
CS  freqCS; 
+   int*propOrder;
+   int numPropsInSampleTable;
charobjType = 0; 
str objStr; 
oid objOid = BUN_NONE; 
@@ -5833,7 +5827,7 @@ str printSampleData(CSSample *csSample, 
for (i = 0; i  num; i++){
sample = csSample[i];
freqCS = freqCSset-items[sample.freqIdx];
-   fprintf(fout,Sample table %d Candidates: , i);
+   fprintf(fout,Table %d\n, i);
for (j = 0; j  (int)sample.candidateCount; j++){
//fprintf(fout,BUNFMT,sample.candidates[j]);
if (sample.candidates[j] != BUN_NONE){
@@ -5844,10 +5838,12 @@ str printSampleData(CSSample *csSample, 
getStringName(sample.candidates[j], canStr, 
mapi, mbat, 1);
 #if USE_SHORT_NAMES
getPropNameShort(canStrShort, canStr);
-   fprintf(fout,;%s,  canStrShort);
+   if (j+1 == (int)sample.candidateCount) 
fprintf(fout, %s,  canStrShort);
+   else fprintf(fout, %s;, canStrShort);
GDKfree(canStrShort);
 #else
-   fprintf(fout,;%s,  canStr);
+   if (j+1 == (int)sample.candidateCount) 
fprintf(fout, %s,  canStr);
+   else fprintf(fout, %s;, canStr);
 #endif
GDKfree(canStr); 

@@ -5882,6 +5878,35 @@ str printSampleData(CSSample *csSample, 
else 
fprintf(fouttb,CREATE TABLE tbSample%d \n (\n, i);
 
+   //Number of tuples
+   fprintf(fout, %d\n, freqCS.support);
+
+   // Compute property order (descending by support) and number of 
properties that are printed
+   if (sampleVersion  1) {
+   numPropsInSampleTable = 
(sample.numPropNUM_PROPS_IN_SAMPLE_DATA)?NUM_PROPS_IN_SAMPLE_DATA:sample.numProp;
+   propOrder = GDKmalloc(sizeof(int) * sample.numProp);
+   for (j = 0; j  sample.numProp; ++j) {
+   propOrder[j] = j;
+   }
+
+   // insertion sort
+   // do not sort Subject (first property), it should 
remain at the first position
+   for (j = 2; j  sample.numProp; ++j) {
+   int tmpPos = propOrder[j];
+   int tmpVal = freqCS.lstPropSupport[tmpPos];
+   int k = j - 1;
+   while (k = 1  
freqCS.lstPropSupport[propOrder[k]]  tmpVal) { // sort descending
+   propOrder[k + 1] = propOrder[k];
+   k--;
+   }
+   propOrder[k + 1] = tmpPos;
+   }
+
+   } else {
+   numPropsInSampleTable = sample.numProp; // all 
properties, no change in order because freqCS.lstPropSupport[] is not yet 
available
+   }
+
+
//List of columns
fprintf(fout,Subject);
fprintf(fouttb,SubjectCol string);
@@ -5891,14 +5916,15 @@ str printSampleData(CSSample *csSample, 
isDescription = 0; 
isImage = 0;
isSite = 0; 
-   for (j = 0; j  sample.numProp; j++){
+   for (j = 0; j  numPropsInSampleTable; j++){
+   int index = j;
if (sampleVersion  1){ //Do not consider 
infreq Prop 
-   if (isInfrequentSampleProp(freqCS, j)) 
continue; 
+

MonetDB: rdf - use all candidates for survey

2014-04-15 Thread Linnea Passing

Changeset: 30047a755a5c for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=30047a755a5c
Modified Files:
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

use all candidates for survey


diffs (12 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -396,7 +396,7 @@ typedef struct CSPropTypes {
 
 #define NUM_SAMPLETABLE 20
 #define NUM_SAMPLE_INSTANCE 10
-#define NUM_SAMPLE_CANDIDATE 3
+#define NUM_SAMPLE_CANDIDATE 999 // print all candidates
 #define SAMPLE_FILTER_THRESHOLD 10  // SAMPLE_FILTER_THRESHOLD/ 100
 #define GETSAMPLE_BEFOREMERGING 1  // Get the sample data before merging CS's
 #define NUM_PROPS_IN_SAMPLE_DATA 8 // how many properties should be printed 
(including subject column)
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - fix baseline for normalizing tf-idf scores

2014-04-15 Thread Linnea Passing

Changeset: 9db5008798de for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9db5008798de
Modified Files:
monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:

fix baseline for normalizing tf-idf scores


diffs (30 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -1413,7 +1413,7 @@ void createPropStatistics(PropStat* prop
 */
 //[DUC] Create propstat for ontology only 
 static
-void createPropStatistics(PropStat* propStat, oid** ontattributes, int 
ontattributesCount) {
+void createPropStatistics(PropStat* propStat, oid** ontattributes, int 
ontattributesCount, int ontmetadataCount) {
int i;
int numProps = 0;
 
@@ -1445,7 +1445,7 @@ void createPropStatistics(PropStat* prop
}
 
for (i = 0; i < propStat->numAdded; ++i) {
-   propStat->tfidfs[i] = log(((float)numProps) / (1 + 
propStat->freqs[i]));
+   propStat->tfidfs[i] = log(((float)ontmetadataCount) / (1 + 
propStat->freqs[i]));
}
 }
 
@@ -1475,7 +1475,7 @@ void createOntologyLookupResult(oid** re
//[DUC] Change the function for getting propStat. Use ontattributes for 
the propStat. 
// Not the properties from freqCS
//createPropStatistics(propStat, freqCSset-numCSadded, freqCSset);
-   createPropStatistics(propStat, ontattributes, ontattributesCount);
+   createPropStatistics(propStat, ontattributes, ontattributesCount, 
ontmetadataCount);
 
for (i = 0; i  freqCSset-numCSadded; ++i) {
CS  cs;
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - fix baseline for normalizing tf-idf scores

2014-04-10 Thread Linnea Passing

Changeset: a6392de1b2d0 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=a6392de1b2d0
Modified Files:
monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:

fix baseline for normalizing tf-idf scores


diffs (27 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -1385,12 +1385,14 @@ void createPropStatistics(PropStat* prop
 static
 void createPropStatistics(PropStat* propStat, oid** ontattributes, int 
ontattributesCount) {
int i;
+   int numProps = 0;
 
for (i = 0; i  ontattributesCount; ++i) {
oid attr = ontattributes[1][i];
// add prop to propStat
BUN bun = BUNfnd(BATmirror(propStat-pBat), (ptr) attr);
if (bun == BUN_NONE) {
+   numProps++;
if (propStat-pBat-T-hash  BATcount(propStat-pBat) 
 4 * propStat-pBat-T-hash-mask) {
HASHdestroy(propStat-pBat);
BAThash(BATmirror(propStat-pBat), 
2*BATcount(propStat-pBat));
@@ -1413,7 +1415,7 @@ void createPropStatistics(PropStat* prop
}
 
for (i = 0; i < propStat->numAdded; ++i) {
-   propStat->tfidfs[i] = log(((float)ontattributesCount) / (1 + 
propStat->freqs[i]));
+   propStat->tfidfs[i] = log(((float)numProps) / (1 + 
propStat->freqs[i]));
}
 }
 
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - simplify computation of ontology-based label beca...

2014-04-10 Thread Linnea Passing

Changeset: 3967b6658444 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3967b6658444
Modified Files:
monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:

simplify computation of ontology-based label because of changed order of data 
sources when assigning labels


diffs (94 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -2076,7 +2076,7 @@ void removeDuplicatedCandidates(CSlabel 
 /* For one CS: Choose the best table name out of all collected candidates 
(ontology, type, fk). */
 static
 void getTableName(CSlabel* label, int csIdx,  int typeAttributesCount, 
TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, TypeStat* typeStat, int typeStatCount, oid** 
result, int* resultCount, IncidentFKs* links, oid** ontmetadata, int 
ontmetadataCount, BAT *ontmetaBat, OntClass *ontclassSet) {
-   int i, j, k;
+   int i, j;
oid *tmpList;
int tmpListCount;
charnameFound = 0;
@@ -2243,9 +2243,8 @@ void getTableName(CSlabel* label, int cs
label-candidatesCount += resultCount[csIdx];
}
 
-   // one ontology class -- use it
-   if (!nameFound){
-   if (resultCount[csIdx] == 1) {
+   // chose first ontology candidate as label
+   if (!nameFound && resultCount[csIdx] >= 1){
label-name = result[csIdx][0];
label-hierarchy = getOntoHierarchy(label-name, 
(label-hierarchyCount), ontmetadata, ontmetadataCount);
nameFound = 1;
@@ -2253,69 +2252,6 @@ void getTableName(CSlabel* label, int cs
label-isOntology = 1; 
#endif
}
-   }
-
-   if (!nameFound) {
-   // multiple ontology classes -- intersect with types
-   if (resultCount[csIdx]  1) {
-   tmpList = NULL;
-   tmpListCount = 0;
-   // search for type values
-   for (i = 0; i  typeAttributesCount; ++i) {
-   for (j = 0; j  
typeAttributesHistogramCount[csIdx][i]; ++j) {
-   if 
(typeAttributesHistogram[csIdx][i][j].percent  TYPE_FREQ_THRESHOLD) break; // 
sorted
-
-   // intersect type with ontology classes
-   for (k = 0; k  resultCount[csIdx]; 
++k) {
-   if (result[csIdx][k] == 
typeAttributesHistogram[csIdx][i][j].value) {
-   // found, copy ontology 
class to tmpList
-   tmpList = (oid *) 
realloc(tmpList, sizeof(oid) * (tmpListCount + 1));
-   if (!tmpList) 
fprintf(stderr, ERROR: Couldn't realloc memory!\n);
-   tmpList[tmpListCount] = 
result[csIdx][k];
-   tmpListCount += 1;
-   }
-   }
-   }
-   }
-
-   // only one left -- use it
-   if (tmpListCount == 1) {
-   label-name = tmpList[0];
-   label-hierarchy = 
getOntoHierarchy(label-name, (label-hierarchyCount), ontmetadata, 
ontmetadataCount);
-   free(tmpList);
-   nameFound = 1;
-   #if INFO_WHERE_NAME_FROM
-   label-isOntology = 1; 
-   #endif
-   }
-
-   if (!nameFound) {
-   // multiple left -- use the class that covers 
most attributes, most popular ontology, ...
-   if (tmpListCount  1) {
-   label-name = tmpList[0]; // sorted
-   label-hierarchy = 
getOntoHierarchy(label-name, (label-hierarchyCount), ontmetadata, 
ontmetadataCount);
-   free(tmpList);
-   nameFound = 1;
-   
-   #if INFO_WHERE_NAME_FROM
-   label-isOntology = 1; 
-   #endif
-   }
-   }
-
-   if (!nameFound) {
-   // empty intersection - use the class that 
covers most attributes, most popular ontology, ..
-   label-name = result[csIdx][0]; // sorted

MonetDB: rdf - fix compile errors when USE_MULTIWAY_MERGING is s...

2014-04-10 Thread Linnea Passing

Changeset: 608c23981c16 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=608c23981c16
Modified Files:
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

fix compile errors when USE_MULTIWAY_MERGING is set to 1, fix mergeCandidates 
after the order of data types has been changed to (type - onto - fk)


diffs (281 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -2016,7 +2016,7 @@ oid* getOntoHierarchy(oid ontology, int*
 static
 void removeDuplicatedCandidates(CSlabel *label) {
int i, j;
-   int cNew = label-candidatesNew, cOnto = label-candidatesOntology, 
cType = label-candidatesType, cFK = label-candidatesFK;
+   int cNew = label-candidatesNew, cType = label-candidatesType, cOnto = 
label-candidatesOntology, cFK = label-candidatesFK;
 
if (label-candidatesCount  2) return; // no duplicates
 
@@ -2028,8 +2028,8 @@ void removeDuplicatedCandidates(CSlabel 
// find out which category (new, onto, type, fk) we are 
in
int *cPtr = NULL;
if (j  label-candidatesNew) cPtr = cNew;
-   else if (j  label-candidatesNew + 
label-candidatesOntology) cPtr = cOnto;
-   else if (j  label-candidatesNew + 
label-candidatesOntology + label-candidatesType) cPtr = cType;
+   else if (j  label-candidatesNew + 
label-candidatesType) cPtr = cType;
+   else if (j  label-candidatesNew + 
label-candidatesType + label-candidatesOntology) cPtr = cOnto;
else cPtr = cFK;
 
if (label-candidates[i] == label-candidates[j] || 
label-candidates[j] == BUN_NONE) {
@@ -2047,8 +2047,8 @@ void removeDuplicatedCandidates(CSlabel 
// update counts
label-candidatesCount -= moveLeft;
label-candidatesNew = cNew;
+   label-candidatesType = cType;
label-candidatesOntology = cOnto;
-   label-candidatesType = cType;
label-candidatesFK = cFK;
}
 
@@ -2062,10 +2062,10 @@ void removeDuplicatedCandidates(CSlabel 
// update value in category;
if (label-candidatesNew  0) {
label-candidatesNew--;
+   } else if (label-candidatesType  0) {
+   label-candidatesType--;
} else if (label-candidatesOntology  0) {
label-candidatesOntology--;
-   } else if (label-candidatesType  0) {
-   label-candidatesType--;
} else {
label-candidatesFK--;
}
@@ -2334,8 +2334,8 @@ CSlabel* initLabels(CSset *freqCSset) {
labels[i].candidates = NULL;
labels[i].candidatesCount = 0;
labels[i].candidatesNew = 0;
+   labels[i].candidatesType = 0;
labels[i].candidatesOntology = 0;
-   labels[i].candidatesType = 0;
labels[i].candidatesFK = 0;
labels[i].hierarchy = NULL;
labels[i].hierarchyCount = 0;
@@ -2790,7 +2790,7 @@ CSlabel* createLabels(CSset* freqCSset, 
  * Result: common name ontology candidates CS1 ontology candidates CS2 
type candidates CS1 type candidates CS2 FK candidates CS1 FK candidates 
CS2
  */
 static
-oid* mergeCandidates(int *candidatesCount, int *candidatesNew, int 
*candidatesOntology, int *candidatesType, int *candidatesFK, CSlabel cs1, 
CSlabel cs2, oid commonName) {
+oid* mergeCandidates(int *candidatesCount, int *candidatesNew, int 
*candidatesType, int *candidatesOntology, int *candidatesFK, CSlabel cs1, 
CSlabel cs2, oid commonName) {
oid *candidates;
int counter = 0;
int i;
@@ -2812,38 +2812,38 @@ oid* mergeCandidates(int *candidatesCoun
}
(*candidatesNew) = counter;
 
-   // copy ontology
-   for (i = 0; i  cs1.candidatesOntology; ++i) {
+   // copy type
+   for (i = 0; i  cs1.candidatesType; ++i) {
candidates[counter] = cs1.candidates[cs1.candidatesNew + i];
counter++;
}
-   for (i = 0; i  cs2.candidatesOntology; ++i) {
+   for (i = 0; i  cs2.candidatesType; ++i) {
candidates[counter] = cs2.candidates[cs2.candidatesNew + i];
counter++;
}
-   (*candidatesOntology) = counter - (*candidatesNew);
-
-   // copy type
-   for (i = 0; i  cs1.candidatesType; ++i) {
-   candidates[counter] = cs1.candidates[cs1.candidatesNew + 
cs1.candidatesOntology + i];
+   (*candidatesType) = counter - (*candidatesNew);
+
+   // copy ontology
+   for (i = 0; i  cs1.candidatesOntology; ++i) {
+   candidates[counter] =

MonetDB: rdf - fix creation of typeAttributesHistogram

2014-03-24 Thread Linnea Passing

Changeset: 6832434de2c2 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6832434de2c2
Modified Files:
monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:

fix creation of typeAttributesHistogram

type values were assigned to the wrong CS because the new csFreqIdx was used 
instead of the old one


diffs (33 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -967,9 +967,7 @@ void createTypeAttributesHistogram(BAT *
// check if property (*pbt) is a type
for (i = 0; i  typeAttributesCount; ++i) {
if (*pbt == typeAttributesOids[i]) {
-
// prop is a type!
-   csFreqIdx = csIdFreqIdxMap[subjCSMap[*sbt]];
 
// get object
obt = (oid *) BUNtloc(oi, p);
@@ -988,6 +986,7 @@ void createTypeAttributesHistogram(BAT *
// nothing to add to histogram
} else {
// analyze values and add to 
histogram
+   csFreqIdx = 
csIdFreqIdxMap[subjCSMap[curS]]; // get csFreqIdx of last subject

insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, 
typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, 
ontmetaBat, ontclassSet);
typeValuesSize = 0; // reset
}
@@ -1008,7 +1007,10 @@ void createTypeAttributesHistogram(BAT *
}
 
// analyze and add last set of typeValues
-   if (curS != BUN_NONE  typeValuesSize != 0) 
insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, 
typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, 
ontmetaBat, ontclassSet);
+   if (curS != BUN_NONE  typeValuesSize != 0) {
+   csFreqIdx = csIdFreqIdxMap[subjCSMap[curS]]; // get csFreqIdx 
of last subject
+   insertLeafsIntoTypeAttributesHistogram(typeValues, 
typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, 
csFreqIdx, curT, ontmetaBat, ontclassSet);
+   }
 
GDKfree(typeValues);
 
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - Store type hierarchy for type values

2014-03-24 Thread Linnea Passing

Changeset: da0c1bd43bd3 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=da0c1bd43bd3
Modified Files:
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdflabels.h
Branch: rdf
Log Message:

Store type hierarchy for type values

Suggested by Peter
Instead of storing the leaf value per subject, store the whole hierarchy. By 
doing so, the frequencies are summed up on the more general levels of the 
hierarchy.
For example, 40% Politicians and 50% Athletes in a CS will be represented as 
(90% Thing, 90% Agent, 90% Person, 50% Athlete, 40% Politician), resulting in 
label candidate Person when threshold is set to 80%.


diffs (173 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -851,47 +851,17 @@ int compareTypeAttributesFreqs (const vo
 #endif
 
 #if USE_TYPE_NAMES
-/* Analyze hierarchy in a list of type values, add all leaf values to the 
histogram. Values that are not present in the hierarchy tree built from the 
ontologies are NOT added to the histogram. */
+/* Add type values to the histogram. Values that are not present in the 
hierarchy tree built from the ontologies are NOT added to the histogram. */
 static
-void insertLeafsIntoTypeAttributesHistogram(oid* typeList, int typeListLength, 
TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat, 
OntClass *ontclassSet) {
-   int i, j, k;
+void insertValuesIntoTypeAttributesHistogram(oid* typeList, int 
typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat) {
+   int i, j;
int fit;
-   char*leaf; // flag whether a type value in 'typeList' is a 
leaf (1) or not (0)
-   BUN pos;
-   OntClasshierarchy;
-
-   // start with: every type value is a leaf
-   leaf = GDKmalloc(sizeof(char) * typeListLength);
-   for (i = 0; i  typeListLength; ++i) leaf[i] = 1;
-
-   // analyze hierarchy
+
for (i = 0; i  typeListLength; ++i) {
-   if (!leaf[i]) continue;
-   pos = BUNfnd(BATmirror(ontmetaBat), typeList[i]);
-   if (pos == BUN_NONE) {
-   // no ontology information for this type value, 
therefore it is not added to the hierarchy
-   leaf[i] = 0;
-   continue;
-   }
-
-   // get hierarchy of this type value
-   hierarchy = ontclassSet[pos];
-
-   // loop over superclasses, set leaf=0
-   for (j = 0; j  hierarchy.numsc; ++j) {
-   for (k = 0; k  typeListLength; ++k) {
-   if (i == k) continue;
-   if (ontclassSet[hierarchy.scIdxes[j]].cOid == 
typeList[k]) {
-   // found superclass at position 'k'
-   leaf[k] = 0;
-   }
-   }
-   }
-   }
-
-   // add all leafs to the histogram
-   for (i = 0; i  typeListLength; ++i) {
-   if (!leaf[i]) continue;
+   BUN pos = BUNfnd(BATmirror(ontmetaBat), typeList[i]);
+   if (pos == BUN_NONE) continue; // no ontology information, 
ignore
+
+   // add to histogram
fit = 0;
for (j = 0; j  typeAttributesHistogramCount[csFreqIdx][type]; 
++j) {
if (typeAttributesHistogram[csFreqIdx][type][j].value 
== typeList[i]) {
@@ -913,13 +883,11 @@ void insertLeafsIntoTypeAttributesHistog

typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type]
 - 1].freq = 1;
}
}
-
-   GDKfree(leaf);
 }
 
 /* Loop through all subjects to collect frequency statistics for type 
attribute values. */
 static
-void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter 
oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int 
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat, OntClass 
*ontclassSet) {
+void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter 
oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int 
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat) {
// looping, extracting
BUN p, q;
oid *sbt, *obt, *pbt;
@@ -987,7 +955,7 @@ void createTypeAttributesHistogram(BAT *
} else {
// analyze values and add to

MonetDB: rdf - Improve label quality

2014-03-20 Thread Linnea Passing

Changeset: 3e4ece2b7085 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3e4ece2b7085
Modified Files:
monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:

Improve label quality

- Computation of similarity between CS's and classes is now based on the 
assumption that all properties of a CS should belong to one ontology class, not 
that the CS has to consist of ALL properties of the corresponding ontology 
class.
- Type values are usually multi-valued properties, the values represent the 
hierarchy the subject belongs to (e.g., if a subject in the dbpedia dataset has 
type 'Athlete', it also has types 'Person', 'Agent', 'Thing'). This hierarchy 
is analyzed and only the most specific type value (the leaf) is added to the 
data structures. This improves the label candidates that are computed using 
type values.


diffs (284 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -851,19 +851,89 @@ int compareTypeAttributesFreqs (const vo
 #endif
 
 #if USE_TYPE_NAMES
+/* Analyze hierarchy in a list of type values, add all leaf values to the 
histogram. Values that are not present in the hierarchy tree built from the 
ontologies are NOT added to the histogram. */
+static
+void insertLeafsIntoTypeAttributesHistogram(oid* typeList, int typeListLength, 
TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat, 
OntClass *ontclassSet) {
+   int i, j, k;
+   int fit;
+   char*leaf; // flag whether a type value in 'typeList' is a 
leaf (1) or not (0)
+   BUN pos;
+   OntClasshierarchy;
+
+   // start with: every type value is a leaf
+   leaf = GDKmalloc(sizeof(char) * typeListLength);
+   for (i = 0; i  typeListLength; ++i) leaf[i] = 1;
+
+   // analyze hierarchy
+   for (i = 0; i  typeListLength; ++i) {
+   if (!leaf[i]) continue;
+   pos = BUNfnd(BATmirror(ontmetaBat), typeList[i]);
+   if (pos == BUN_NONE) {
+   // no ontology information for this type value, 
therefore it is not added to the hierarchy
+   leaf[i] = 0;
+   continue;
+   }
+
+   // get hierarchy of this type value
+   hierarchy = ontclassSet[pos];
+
+   // loop over superclasses, set leaf=0
+   for (j = 0; j  hierarchy.numsc; ++j) {
+   for (k = 0; k  typeListLength; ++k) {
+   if (i == k) continue;
+   if (ontclassSet[hierarchy.scIdxes[j]].cOid == 
typeList[k]) {
+   // found superclass at position 'k'
+   leaf[k] = 0;
+   }
+   }
+   }
+   }
+
+   // add all leafs to the histogram
+   for (i = 0; i  typeListLength; ++i) {
+   if (!leaf[i]) continue;
+   fit = 0;
+   for (j = 0; j  typeAttributesHistogramCount[csFreqIdx][type]; 
++j) {
+   if (typeAttributesHistogram[csFreqIdx][type][j].value 
== typeList[i]) {
+   // bucket exists
+   
typeAttributesHistogram[csFreqIdx][type][j].freq += 1;
+   fit = 1;
+   break;
+   }
+   }
+   if (!fit) {
+   // bucket does not exist
+   // realloc
+   typeAttributesHistogramCount[csFreqIdx][type] += 1;
+   typeAttributesHistogram[csFreqIdx][type] = 
(TypeAttributesFreq *) realloc(typeAttributesHistogram[csFreqIdx][type], 
sizeof(TypeAttributesFreq) * typeAttributesHistogramCount[csFreqIdx][type]);
+   if (!typeAttributesHistogram[csFreqIdx][type]) 
fprintf(stderr, ERROR: Couldn't realloc memory!\n);
+
+   // insert value
+   
typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type]
 - 1].value = typeList[i];
+   
typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type]
 - 1].freq = 1;
+   }
+   }
+
+   GDKfree(leaf);
+}
+
 /* Loop through all subjects to collect frequency statistics for type 
attribute values. */
 static
-void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter 
oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int 
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, char** typeAttributes) {
+void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter 
oi, oid *subjCSMap,

MonetDB: rdf - Workaround for memory bug: increase INIT_NUM_CS

2013-09-26 Thread Linnea Passing

Changeset: 2d593f5bbd8f for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=2d593f5bbd8f
Modified Files:
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Workaround for memory bug: increase INIT_NUM_CS


diffs (12 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -169,7 +169,7 @@ typedef struct SubCSSet{
int numAllocation; 
 } SubCSSet;
 
-#define INIT_NUM_CS 100
+#define INIT_NUM_CS 9  // workaround
 #define SIM_THRESHOLD 0.6
 #define SIM_TFIDF_THRESHOLD 0.55
 #define IMPORTANCE_THRESHOLD 0.01
___
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - Remove duplicate values in candidate lists and up...

2013-09-26 Thread Linnea Passing

Changeset: 6cc339a6347d for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6cc339a6347d
Modified Files:
monetdb5/extras/rdf/rdf.h
monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:

Remove duplicate values in candidate lists and update candidate lists when 
merging CS's.


diffs (truncated from 316 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h
--- a/monetdb5/extras/rdf/rdf.h
+++ b/monetdb5/extras/rdf/rdf.h
@@ -108,6 +108,10 @@ typedef struct CSlabel {
oid name;   // table name
oid *candidates;// list of table name candidates, 
candidates[0] == name
int candidatesCount;// number of entries in the candidates 
list
+   int candidatesNew;  // number of candidates that 
are created during merging (e.g. ancestor name)
+   int candidatesOntology; // number of ontology 
candidates (first category)
+   int candidatesType; // number of type candidates 
(second category)
+   int candidatesFK;   // number of fk candidates 
(third category)
oid *hierarchy; // hierarchy bottom to top
int hierarchyCount; // number of entries in the hierarchy 
list
int numProp;// number of properties, copied from 
freqCSset-items[x].numProp
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -1831,6 +1831,50 @@ oid* getOntoHierarchy(oid ontology, int*
return hierarchy;
 }
 
+/* Remove duplicated candidate values and remove DUMMY values if better 
candidates exist
+ */
+static
+void removeDuplicatedCandidates(CSlabel *label) {
+   int i, j;
+   int cNew = label-candidatesNew, cOnto = label-candidatesOntology, 
cType = label-candidatesType, cFK = label-candidatesFK;
+
+   if (label-candidatesCount  2) return; // no duplicates
+
+   // loop through all candidates
+   for (i = 0; i  label-candidatesCount - 1; ++i) {
+   // search (direction: right) whether this value occurs again
+   int moveLeft = 0;
+   for (j = i + 1; j  label-candidatesCount; ++j) {
+   // find out which category (new, onto, type, fk) we are 
in
+   int *cPtr = NULL;
+   if (j  label-candidatesNew) cPtr = cNew;
+   else if (j  label-candidatesNew + 
label-candidatesOntology) cPtr = cOnto;
+   else if (j  label-candidatesNew + 
label-candidatesOntology + label-candidatesType) cPtr = cType;
+   else cPtr = cFK;
+
+   if (label-candidates[i] == label-candidates[j] || 
label-candidates[j] == BUN_NONE) {
+   // DUMMY value will be overwritten
+   // OR:
+   // value occurs again, will be overwritten
+   moveLeft++;
+   (*cPtr)--;
+   } else {
+   // different value, keep it
+   label-candidates[j - moveLeft] = 
label-candidates[j];
+   }
+   }
+   // value 'i' is unique now
+   // update counts
+   label-candidatesCount -= moveLeft;
+   label-candidatesNew = cNew;
+   label-candidatesOntology = cOnto;
+   label-candidatesType = cType;
+   label-candidatesFK = cFK;
+   }
+
+   // DUMMY value on position 0 is kept to ensure that name == 
candidates[0]
+}
+
 #if USE_TABLE_NAME
 /* For one CS: Choose the best table name out of all collected candidates 
(ontology, type, fk). */
 static
@@ -1843,6 +1887,7 @@ void getTableName(CSlabel* label, int cs
// --- ONTOLOGY ---
// add all ontology candidates to list of candidates
if (resultCount[csIdx] = 1) {
+   label-candidatesOntology = resultCount[csIdx];
label-candidates = GDKrealloc(label-candidates, sizeof(oid) * 
(label-candidatesCount + resultCount[csIdx]));
if (!label-candidates) fprintf(stderr, ERROR: Couldn't 
realloc memory!\n);
for (i = 0; i  resultCount[csIdx]; ++i) {
@@ -1925,6 +1970,7 @@ void getTableName(CSlabel* label, int cs
// add all most frequent type values to list of candidates
if (tmpListCount = 1) {
int counter = 0;
+   label-candidatesType = tmpListCount;
label-candidates = GDKrealloc(label-candidates, sizeof(oid) * 
(label-candidatesCount + tmpListCount));
if (!label-candidates) fprintf(stderr, ERROR: Couldn't 
realloc memory!\n);
for (i = 0; i  typeStatCount; ++i) {
@@ -1965,6 +2011,7 @@ void

MonetDB: rdf - Remove dummy values from candidate lists and upda...

2013-09-26 Thread Linnea Passing

Changeset: 5caad43d9d63 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=5caad43d9d63
Modified Files:
monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:

Remove dummy values from candidate lists and update CS name accordingly
DUMMY values on position 0 of the candidate list will be removed. Therefore, CS 
names have to be updated to ensure that (candidates[0] == name).


diffs (69 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -1872,7 +1872,24 @@ void removeDuplicatedCandidates(CSlabel 
label-candidatesFK = cFK;
}
 
-   // DUMMY value on position 0 is kept to ensure that name == 
candidates[0]
+   // remove DUMMY value on position 0
+   if (label-candidates[0] == BUN_NONE  label-candidatesCount  1) {
+   for (i = 1; i  label-candidatesCount; ++i) {
+   label-candidates[i - 1] = label-candidates[i];
+   }
+   label-candidatesCount--;
+
+   // update value in category;
+   if (label-candidatesNew  0) {
+   label-candidatesNew--;
+   } else if (label-candidatesOntology  0) {
+   label-candidatesOntology--;
+   } else if (label-candidatesType  0) {
+   label-candidatesType--;
+   } else {
+   label-candidatesFK--;
+   }
+   }
 }
 
 #if USE_TABLE_NAME
@@ -2624,6 +2641,9 @@ str updateLabel(int ruleNumber, CSset *f
label-candidatesType = candidatesType;
label-candidatesFK = candidatesFK;
removeDuplicatedCandidates(label);
+   if (label-name == BUN_NONE  label-candidates[0] != 
BUN_NONE) {
+   label-name = label-candidates[0];
+   }
 
// hierarchy
if ((*labels)[freqCS1].name == label-name) {
@@ -2667,6 +2687,9 @@ str updateLabel(int ruleNumber, CSset *f
label-candidatesType = candidatesType;
label-candidatesFK = candidatesFK;
removeDuplicatedCandidates(label);
+   if (label-name == BUN_NONE  label-candidates[0] != 
BUN_NONE) {
+   label-name = label-candidates[0];
+   }
 
// hierarchy
freqCS1Counter = (*labels)[freqCS1].hierarchyCount - 1;
@@ -2699,6 +2722,9 @@ str updateLabel(int ruleNumber, CSset *f
label-candidatesType = candidatesType;
label-candidatesFK = candidatesFK;
removeDuplicatedCandidates(label);
+   if (label-name == BUN_NONE  label-candidates[0] != 
BUN_NONE) {
+   label-name = label-candidates[0];
+   }
 
// hierarchy already set
// properties already set
@@ -2728,6 +2754,9 @@ str updateLabel(int ruleNumber, CSset *f
label-candidatesType = candidatesType;
label-candidatesFK = candidatesFK;
removeDuplicatedCandidates(label);
+   if (label-name == BUN_NONE  label-candidates[0] != 
BUN_NONE) {
+   label-name = label-candidates[0];
+   }
 
// hierarchy
label-hierarchyCount = big.hierarchyCount;
___
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - Fix use of short and long names in GraphViz export

2013-09-26 Thread Linnea Passing

Changeset: fb94f422a1f0 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=fb94f422a1f0
Modified Files:
monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:

Fix use of short and long names in GraphViz export


diffs (109 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -1664,6 +1664,7 @@ void printUML2(CSset *freqCSset, CSlabel
for (i = 0; i  freqCSset-numCSadded; ++i) {
int width;
str labelStr;
+   str tmpStr;
str labelStrEscaped = NULL;
 #if USE_SHORT_NAMES
str labelStrShort = NULL;
@@ -1682,7 +1683,8 @@ void printUML2(CSset *freqCSset, CSlabel
if (!labelStrEscaped) fprintf(stderr, ERROR: Couldn't 
malloc memory!\n);
strcpy(labelStrEscaped, DUMMY);
} else {
-   takeOid(labels[i].name, labelStr);
+   takeOid(labels[i].name, tmpStr);
+   labelStr = removeBrackets(tmpStr);
 #if USE_SHORT_NAMES
getPropNameShort(labelStrShort, labelStr);
labelStrEscaped = (str) GDKmalloc(sizeof(char) * 
(strlen(labelStrShort) + 1));
@@ -1707,19 +1709,13 @@ void printUML2(CSset *freqCSset, CSlabel
str propStr;
str tmpStr;
char*propStrEscaped = NULL;
+#if USE_SHORT_NAMES
char*propStrShort = NULL;
+#endif
str color;
 
takeOid(cs.lstProp[j], tmpStr);
 
-   // copy propStr to propStrEscaped because .dot-PORTs 
cannot contain colons and quotes
-   propStr = removeBrackets(tmpStr);
-   propStrEscaped = (char *) malloc(sizeof(char) * 
(strlen(propStr) + 1));
-   if (!propStrEscaped) fprintf(stderr, ERROR: Couldn't 
malloc memory!\n);
-   memcpy(propStrEscaped, propStr, (strlen(propStr) + 1));
-   escapeURI(propStrEscaped);
-   getPropNameShort(propStrShort, propStr);
-
// assign color (the more tuples the property occurs 
in, the darker
if ((1.0 * cs.lstPropSupport[j])/cs.support  0.8) {
color = #FF;
@@ -1732,10 +1728,22 @@ void printUML2(CSset *freqCSset, CSlabel
} else {
color = #FF;
}
+
+   // copy propStr to propStrEscaped because .dot-PORTs 
cannot contain colons and quotes
+   propStr = removeBrackets(tmpStr);
+   propStrEscaped = (char *) malloc(sizeof(char) * 
(strlen(propStr) + 1));
+   if (!propStrEscaped) fprintf(stderr, ERROR: Couldn't 
malloc memory!\n);
+   memcpy(propStrEscaped, propStr, (strlen(propStr) + 1));
+   escapeURI(propStrEscaped);
+#if USE_SHORT_NAMES
+   getPropNameShort(propStrShort, propStr);
fprintf(fout, TRTD BGCOLOR=\%s\ PORT=\%s\%s 
(%d%%)/TD/TR\n, color, propStrEscaped, propStrShort, (100 * 
cs.lstPropSupport[j])/cs.support);
+   GDKfree(propStrShort);
+#else
+   fprintf(fout, TRTD BGCOLOR=\%s\ PORT=\%s\%s 
(%d%%)/TD/TR\n, color, propStrEscaped, propStrEscaped, (100 * 
cs.lstPropSupport[j])/cs.support);
+#endif
 
GDKfree(propStr);
-   GDKfree(propStrShort);
free(propStrEscaped);
GDKfree(tmpStr); 
 
@@ -1752,6 +1760,9 @@ void printUML2(CSset *freqCSset, CSlabel
str tmpStr;
str propStr;
char*propStrEscaped = NULL;
+#if USE_SHORT_NAMES
+   char*propStrShort = NULL;
+#endif
 
takeOid(cs.lstProp[j], tmpStr);
 
@@ -1762,15 +1773,30 @@ void printUML2(CSset *freqCSset, CSlabel
memcpy(propStrEscaped, propStr, (strlen(propStr) + 1));
escapeURI(propStrEscaped);
 
+#if USE_SHORT_NAMES
+   getPropNameShort(propStrShort, propStr);
for (k = 0; k  relationMetadataCount[i][j]; ++k) {
 
if (relationMetadata[i][j][k].percent = 
FK_FREQ_THRESHOLD) {
// target of links is frequent enough, 
not an outlier
int from = 
relationMetadata[i][j][k].from;
int to = relationMetadata[i][j][k].to;
-   fprintf(fout, \BUNFMT\:\%s\ - 
\BUNFMT\ [label=\%s\];\n,

MonetDB: rdf - Update labels when CS's are merged

2013-09-25 Thread Linnea Passing

Changeset: 2b0ab4777950 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=2b0ab4777950
Modified Files:
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdflabels.h
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Update labels when CS's are merged
Updates label, hierarchy and properties. Does not update candidates yet.


diffs (truncated from 851 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -1790,7 +1790,7 @@ oid* getOntoHierarchy(oid ontology, int*
 
// add 'ontology' to hierarchy
(*hierarchyCount) = 1;
-   hierarchy = (oid *) malloc(sizeof(oid) * (*hierarchyCount));
+   hierarchy = (oid *) GDKmalloc(sizeof(oid) * (*hierarchyCount));
if (!hierarchy)
fprintf(stderr, ERROR: Couldn't malloc memory!\n);
hierarchy[(*hierarchyCount) -1] = ontology;
@@ -1815,7 +1815,7 @@ oid* getOntoHierarchy(oid ontology, int*
// superclass
// add 'msuperstr' to hierarchy
(*hierarchyCount) += 1;
-   hierarchy = realloc(hierarchy, 
sizeof(oid) * (*hierarchyCount));
+   hierarchy = GDKrealloc(hierarchy, 
sizeof(oid) * (*hierarchyCount));
if (!hierarchy)
fprintf(stderr, ERROR: 
Couldn't realloc memory!\n);
hierarchy[(*hierarchyCount) -1] = 
msuper;
@@ -1843,7 +1843,7 @@ void getTableName(CSlabel* label, int cs
// --- ONTOLOGY ---
// add all ontology candidates to list of candidates
if (resultCount[csIdx] = 1) {
-   label-candidates = realloc(label-candidates, sizeof(oid) * 
(label-candidatesCount + resultCount[csIdx]));
+   label-candidates = GDKrealloc(label-candidates, sizeof(oid) * 
(label-candidatesCount + resultCount[csIdx]));
if (!label-candidates) fprintf(stderr, ERROR: Couldn't 
realloc memory!\n);
for (i = 0; i  resultCount[csIdx]; ++i) {
label-candidates[label-candidatesCount + i] = 
result[csIdx][i];
@@ -1925,7 +1925,7 @@ void getTableName(CSlabel* label, int cs
// add all most frequent type values to list of candidates
if (tmpListCount = 1) {
int counter = 0;
-   label-candidates = realloc(label-candidates, sizeof(oid) * 
(label-candidatesCount + tmpListCount));
+   label-candidates = GDKrealloc(label-candidates, sizeof(oid) * 
(label-candidatesCount + tmpListCount));
if (!label-candidates) fprintf(stderr, ERROR: Couldn't 
realloc memory!\n);
for (i = 0; i  typeStatCount; ++i) {
for (j = 0; j  tmpListCount; ++j) {
@@ -1965,7 +1965,7 @@ void getTableName(CSlabel* label, int cs
// --- FK ---
// add top3 fk values to list of candidates
if (links[csIdx].num  0) {
-   label-candidates = realloc(label-candidates, sizeof(oid) * 
(label-candidatesCount + MIN(3, links[csIdx].num)));
+   label-candidates = GDKrealloc(label-candidates, sizeof(oid) * 
(label-candidatesCount + MIN(3, links[csIdx].num)));
if (!label-candidates) fprintf(stderr, ERROR: Couldn't 
realloc memory!\n);
for (i = 0; i  MIN(3, links[csIdx].num); ++i) {
label-candidates[label-candidatesCount + i] = 
links[csIdx].fks[0].prop;
@@ -1983,7 +1983,7 @@ void getTableName(CSlabel* label, int cs
 
// --- NOTHING ---
if (label-candidatesCount == 0) {
-   label-candidates = realloc(label-candidates, sizeof(oid));
+   label-candidates = GDKrealloc(label-candidates, sizeof(oid));
if (!label-candidates) fprintf(stderr, ERROR: Couldn't 
realloc memory!\n);
label-candidates[0] = BUN_NONE;
label-candidatesCount = 1;
@@ -2004,7 +2004,7 @@ CSlabel* initLabels(CSset *freqCSset) {
CSlabel *labels;
int i;
 
-   labels = (CSlabel *) malloc(sizeof(CSlabel) * freqCSset-numCSadded);
+   labels = (CSlabel *) GDKmalloc(sizeof(CSlabel) * freqCSset-numCSadded);
if (!labels) fprintf(stderr, ERROR: Couldn't malloc memory!\n);
for (i = 0; i  freqCSset-numCSadded; ++i) {
labels[i].candidates = NULL;
@@ -2031,7 +2031,7 @@ void getAllLabels(CSlabel* labels, CSset
 
// copy attribute oids (names)
labels[i].numProp = cs.numProp;
-   labels[i].lstProp = (oid *) malloc(sizeof(oid) * cs.numProp);
+   labels[i].lstProp = (oid *) GDKmalloc(sizeof(oid) * cs.numProp);

MonetDB: rdf - Store oids instead of strings to improve performance

2013-09-19 Thread Linnea Passing

Changeset: eb32228c325e for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=eb32228c325e
Modified Files:
monetdb5/extras/rdf/rdf.h
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdflabels.h
monetdb5/extras/rdf/rdfontologyload.c
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

Store oids instead of strings to improve performance
Store oids during the labeling process, transform them into strings for export 
only. URI string format: http:///


diffs (truncated from 2090 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h
--- a/monetdb5/extras/rdf/rdf.h
+++ b/monetdb5/extras/rdf/rdf.h
@@ -105,13 +105,13 @@ typedef enum {
 
 // Final data structure that stores the labels for tables and attributes
 typedef struct CSlabel {
-   str name;   // table name
-   str *candidates;// list of table name candidates, 
candidates[0] == name
+   oid name;   // table name
+   oid *candidates;// list of table name candidates, 
candidates[0] == name
int candidatesCount;// number of entries in the candidates 
list
-   str *hierarchy; // hierarchy bottom to top
+   oid *hierarchy; // hierarchy bottom to top
int hierarchyCount; // number of entries in the hierarchy 
list
int numProp;// number of properties, copied from 
freqCSset-items[x].numProp
-   char**lstProp;  // attribute names (same order as in 
freqCSset-items[x].lstProp)
+   oid *lstProp;   // attribute names (same order as in 
freqCSset-items[x].lstProp)
 } CSlabel;
 
 #endif /* _RDF_H_ */
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -27,79 +27,79 @@
 // list of known ontologies
 int ontologyCount = 73;
 ontology ontologies[] = {
-{{http:, www.facebook.com, 2008}, 3},
-{{http:, facebook.com, 2008}, 3},
-{{http:, developers.facebook.com, schema}, 3},
-{{https:, www.facebook.com, 2008}, 3},
-{{http:, purl.org, dc, elements, 1.1}, 5}, // dc DublinCore
-{{http:, purl.org, dc, terms}, 4}, // DublinCore
-{{http:, purl.org, goodrelations, v1}, 4}, // GoodRelations
-{{http:, purl.org, rss, 1.0, modules}, 5},
-{{http:, purl.org, stuff}, 3},
-{{http:, www.purl.org, stuff}, 3},
-{{http:, ogp.me, ns}, 3},
-{{https:, ogp.me, ns}, 3},
-{{http:, www.w3.org, 1999, 02, 22-rdf-syntax-ns}, 5}, // rdf
-{{http:, www.w3.org, 2000, 01, rdf-schema}, 5}, // rdfs
-{{http:, www.w3.org, 2004, 02, skos, core}, 6}, // skos (Simple 
Knowledge Organization System)
-{{http:, www.w3.org, 2002, 07, owl}, 5},
-{{http:, www.w3.org, 2006, vcard, ns}, 5}, // vcard
-{{http:, www.w3.org, 2001, vcard-rdf, 3.0}, 5},
-{{http:, www.w3.org, 2003, 01, geo, wgs84_pos}, 6}, // geo
-{{http:, www.w3.org, 1999, xhtml, vocab}, 5}, // xhtml
-{{http:, search.yahoo.com, searchmonkey}, 3},
-{{https:, search.yahoo.com, searchmonkey}, 3},
-{{http:, search.yahoo.co.jp, searchmonkey}, 3},
-{{http:, g.yahoo.com, searchmonkey}, 3},
-{{http:, opengraphprotocol.org, schema}, 3},
-{{https:, opengraphprotocol.org, schema}, 3},
-{{http:, opengraph.org, schema}, 3},
-{{https:, opengraph.org, schema}, 3},
-{{http:, creativecommons.org, ns}, 3}, // cc
-{{http:, rdf.data-vocabulary.org}, 2}, // by google
-{{http:, rdfs.org, sioc, ns}, 4}, // sioc (pronounced shock, 
Semantically-Interlinked Online Communities Project)
-{{http:, xmlns.com, foaf, 0.1}, 4}, // foaf (Friend of a Friend)
-{{http:, mixi-platform.com, ns}, 3}, // japanese social graph
-{{http:, commontag.org, ns}, 3},
-{{http:, semsl.org, ontology}, 3}, // semantic web for second life
-{{http:, schema.org}, 2},
-{{http:, openelectiondata.org, 0.1}, 3},
-{{http:, search.aol.com, rdf}, 3},
-{{http:, www.loc.gov, loc.terms, relators}, 4}, // library of congress
-{{http:, dbpedia.org, ontology}, 3}, // dbo
-{{http:, dbpedia.org, resource}, 3}, // dbpedia
-{{http:, dbpedia.org, property}, 3}, // dbp
-{{http:, www.aktors.org, ontology, portal}, 4}, // akt (research, 
publications, ...)
-{{http:, purl.org, ontology, bibo}, 4}, // bibo (bibliography)
-{{http:, purl.org, ontology, mo}, 4}, // mo (music)
-{{http:, www.geonames.org, ontology}, 3}, // geonames
-{{http:, purl.org, vocab, frbr, core}, 5}, // frbr (Functional 
Requirements for Bibliographic Records)
-{{http:, www.w3.org, 2001, XMLSchema}, 4}, // xsd
-{{http:, www.w3.org, 2006, time}, 4}, // time
-{{http:, purl.org, NET, c4dm, event.owl}, 5}, // event
-{{http:, www.openarchives.org, ore, terms}, 4}, // ore (Open Archive)
-{{http:, purl.org, vocab, bio, 0.1}, 5}, // bio (biographical data)
-{{http:, www.holygoat.co.uk, owl, redwood, 0.1, tags}, 6}, // tag
-{{http:, rdfs.org, ns, void}, 4}, // void (Vocabulary of Interlinked 
Datasets)
-{{http:, www.w3.org, 2006, http}, 4}, //

MonetDB: rdf - do not use getPropNameShort during labeling

2013-09-17 Thread Linnea Passing

Changeset: 3ed3276b486d for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3ed3276b486d
Modified Files:
monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:

do not use getPropNameShort during labeling
short names caused an error in creating the hierarchy


diffs (164 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -1612,9 +1612,6 @@ void printUML2(CSset *freqCSset, CSlabel
 
for (j = 0; j  cs.numProp; ++j) {
char*propStrEscaped = NULL;
-#if USE_SHORT_NAMES
-   char*propStrShort = NULL;
-#endif
 
takeOid(cs.lstProp[j], tmpStr);
 
@@ -1624,9 +1621,6 @@ void printUML2(CSset *freqCSset, CSlabel
if (!propStrEscaped) fprintf(stderr, ERROR: Couldn't 
malloc memory!\n);
memcpy(propStrEscaped, propStr, (strlen(propStr) + 1));
escapeURI(propStrEscaped);
-#if USE_SHORT_NAMES
-   getPropNameShort(propStrShort, propStr);
-#endif
 
for (k = 0; k  relationMetadataCount[i][j]; ++k) {
 
@@ -1634,11 +1628,7 @@ void printUML2(CSset *freqCSset, CSlabel
// target of links is frequent enough, 
not an outlier
int from = 
relationMetadata[i][j][k].from;
int to = relationMetadata[i][j][k].to;
-#if USE_SHORT_NAMES
-   fprintf(fout, \BUNFMT\:\%s\ - 
\BUNFMT\ [label=\%s\];\n, freqCSset-items[from].csId, propStrEscaped, 
freqCSset-items[to].csId, propStrShort); // print foreign keys to dot file
-#else
fprintf(fout, \BUNFMT\:\%s\ - 
\BUNFMT\ [label=\%s\];\n, freqCSset-items[from].csId, propStrEscaped, 
freqCSset-items[to].csId, propStr); // print foreign keys to dot file
-#endif
}
}
GDKfree(tmpStr);
@@ -1701,7 +1691,6 @@ str* getOntoHierarchy(str ontology, int*
foundTop = 1;
}
}
-
return hierarchy;
 }
 
@@ -1728,12 +1717,8 @@ void getTableName(CSlabel* label, int cs
 
// one ontology class -- use it
if (resultCount[csIdx] == 1) {
-#if USE_SHORT_NAMES
-   getPropNameShort((label-name), result[csIdx][0]);
-#else
label-name = (char *) malloc(sizeof(char) * 
(strlen(result[csIdx][0]) + 1));
strcpy(label-name, result[csIdx][0]);
-#endif
label-hierarchy = getOntoHierarchy(label-name, 
(label-hierarchyCount), ontmetadata, ontmetadataCount);
nameFound = 1;
}
@@ -1762,12 +1747,8 @@ void getTableName(CSlabel* label, int cs
 
// only one left -- use it
if (tmpListCount == 1) {
-#if USE_SHORT_NAMES
-   getPropNameShort((label-name), tmpList[0]);
-#else
label-name = (char *) malloc(sizeof(char) * 
(strlen(tmpList[0]) + 1));
strcpy(label-name, tmpList[0]);
-#endif
label-hierarchy = 
getOntoHierarchy(label-name, (label-hierarchyCount), ontmetadata, 
ontmetadataCount);
free(tmpList);
nameFound = 1;
@@ -1776,12 +1757,8 @@ void getTableName(CSlabel* label, int cs
if (!nameFound) {
// multiple left -- use the class that covers 
most attributes, most popular ontology, ...
if (tmpListCount  1) {
-#if USE_SHORT_NAMES
-   getPropNameShort((label-name), 
tmpList[0]); // sorted
-#else
label-name = (char *) 
malloc(sizeof(char) * (strlen(tmpList[0]) + 1));
strcpy(label-name, tmpList[0]); // 
sorted
-#endif
label-hierarchy = 
getOntoHierarchy(label-name, (label-hierarchyCount), ontmetadata, 
ontmetadataCount);
free(tmpList);
nameFound = 1;
@@ -1790,12 +1767,8 @@ void getTableName(CSlabel* label, int cs
 
if (!nameFound) {
// empty intersection - use the class that 
covers most attributes, most popular ontology, ..
-#if USE_SHORT_NAMES
-   getPropNameShort((label-name), 
result[csIdx][0]); // sorted
-#else
label-name = (char *) malloc(sizeof(char) * 
(strlen(result[csIdx][0]) + 1));
strcpy(label-name, result[csIdx][0]); // sorted
-#endif
label-hierarchy =

MonetDB: rdf - Add list of candidates for each CSlabel

2013-09-16 Thread Linnea Passing

Changeset: 9aa7d8033c08 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9aa7d8033c08
Modified Files:
monetdb5/extras/rdf/rdf.h
monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:

Add list of candidates for each CSlabel
Besides the name, a list of label candidates is stored for each CSlabel. The 
candidates are used by the CS merging algorithm.


diffs (truncated from 450 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h
--- a/monetdb5/extras/rdf/rdf.h
+++ b/monetdb5/extras/rdf/rdf.h
@@ -106,6 +106,8 @@ typedef enum {
 // Final data structure that stores the labels for tables and attributes
 typedef struct CSlabel {
str name;   // table name
+   str *candidates;// list of table name candidates, 
candidates[0] == name
+   int candidatesCount;// number of entries in the candidates 
list
str *hierarchy; // hierarchy bottom to top
int hierarchyCount; // number of entries in the hierarchy 
list
int numProp;// number of properties, copied from 
freqCSset-items[x].numProp
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -1712,8 +1712,20 @@ void getTableName(CSlabel* label, int cs
int i, j, k;
str *tmpList;
int tmpListCount;
+   charnameFound = 0;
 
// --- ONTOLOGY ---
+   // add all ontology candidates to list of candidates
+   if (resultCount[csIdx] = 1) {
+   label-candidates = realloc(label-candidates, sizeof(str) * 
(label-candidatesCount + resultCount[csIdx]));
+   if (!label-candidates) fprintf(stderr, ERROR: Couldn't 
realloc memory!\n);
+   for (i = 0; i  resultCount[csIdx]; ++i) {
+   label-candidates[label-candidatesCount + i] = (char 
*) malloc(sizeof(char) * (strlen(result[csIdx][i]) + 1));
+   strcpy(label-candidates[label-candidatesCount + i], 
result[csIdx][i]);
+   }
+   label-candidatesCount += resultCount[csIdx];
+   }
+
// one ontology class -- use it
if (resultCount[csIdx] == 1) {
 #if USE_SHORT_NAMES
@@ -1721,65 +1733,74 @@ void getTableName(CSlabel* label, int cs
 #else
label-name = (char *) malloc(sizeof(char) * 
(strlen(result[csIdx][0]) + 1));
strcpy(label-name, result[csIdx][0]);
+#endif
label-hierarchy = getOntoHierarchy(label-name, 
(label-hierarchyCount), ontmetadata, ontmetadataCount);
-#endif
-   return;
+   nameFound = 1;
}
 
-   // multiple ontology classes -- intersect with types
-   if (resultCount[csIdx]  1) {
-   tmpList = NULL;
-   tmpListCount = 0;
-   // search for type values
-   for (i = 0; i  typeAttributesCount; ++i) {
-   for (j = 0; j  typeAttributesHistogramCount[csIdx][i]; 
++j) {
-   if 
(typeAttributesHistogram[csIdx][i][j].percent  TYPE_FREQ_THRESHOLD) break; // 
sorted
-   // intersect type with ontology classes
-   for (k = 0; k  resultCount[csIdx]; ++k) {
-   if (strcmp(result[csIdx][k], 
typeAttributesHistogram[csIdx][i][j].value) == 0) {
-   // found, copy ontology class 
to tmpList
-   tmpList = (str *) 
realloc(tmpList, sizeof(str) * (tmpListCount + 1));
-   if (!tmpList) fprintf(stderr, 
ERROR: Couldn't realloc memory!\n);
-   tmpList[tmpListCount] = 
result[csIdx][k]; // pointer, no copy
-   tmpListCount += 1;
+   if (!nameFound) {
+   // multiple ontology classes -- intersect with types
+   if (resultCount[csIdx]  1) {
+   tmpList = NULL;
+   tmpListCount = 0;
+   // search for type values
+   for (i = 0; i  typeAttributesCount; ++i) {
+   for (j = 0; j  
typeAttributesHistogramCount[csIdx][i]; ++j) {
+   if 
(typeAttributesHistogram[csIdx][i][j].percent  TYPE_FREQ_THRESHOLD) break; // 
sorted
+   // intersect type with ontology classes
+   for (k = 0; k  resultCount[csIdx]; 
++k) {
+   if (strcmp(result[csIdx][k], 
typeAttributesHistogram[csIdx][i][j].value) == 0) {
+   // found, copy ontology 
class to

MonetDB: rdf - Add ontology tree

2013-09-12 Thread Linnea Passing

Changeset: 4f9d12a701c4 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=4f9d12a701c4
Modified Files:
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdflabels.h
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

Add ontology tree
Stores distribution of data, used for CS merging


diffs (truncated from 331 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -1656,7 +1656,6 @@ void printUML2(CSset *freqCSset, CSlabel
TKNZRclose(ret);
 }
 
-#if USE_TABLE_NAME
 static
 str* getOntoHierarchy(str ontology, int* hierarchyCount, str** ontmetadata, 
int ontmetadataCount) {
int i;
@@ -1677,7 +1676,7 @@ str* getOntoHierarchy(str ontology, int*
// lookup superclass
int foundTuple = 0;
for (i = 0; i  ontmetadataCount; ++i) {
-   str muristr = ontmetadata[0][i];
+   str muristr = ontmetadata[0][i];
str msuperstr = ontmetadata[1][i];
if (strcmp(hierarchy[(*hierarchyCount) - 1], muristr) 
== 0) {
// found entry
@@ -1707,8 +1706,6 @@ str* getOntoHierarchy(str ontology, int*
 
return hierarchy;
 }
-#endif
-
 
 #if USE_TABLE_NAME
 /* For one CS: Choose the best table name out of all collected candidates 
(ontology, type, fk). */
@@ -1972,6 +1969,182 @@ void createLinks(CSset* freqCSset, Relat
 #endif
 
 static
+void createOntoUsageTreeStatistics(OntoUsageNode* tree, int numTuples) {
+   int i;
+
+   if (tree-numChildren == 0) {
+   // leaf node
+   tree-numOccurancesSum = tree-numOccurances;
+   tree-percentage = (1.0 * tree-numOccurancesSum) / numTuples;
+   } else {
+   // inner node
+   tree-numOccurancesSum = tree-numOccurances;
+   for (i = 0; i  tree-numChildren; ++i) {
+   createOntoUsageTreeStatistics(tree-lstChildren[i], 
numTuples);
+   // sum up data
+   tree-numOccurancesSum += 
tree-lstChildren[i]-numOccurancesSum;
+   }
+   tree-percentage = (1.0 * tree-numOccurancesSum) / numTuples;
+   }
+}
+
+static
+void addToOntoUsageTree(OntoUsageNode* tree, str* hierarchy, int 
hierarchyCount, int numTuples) {
+   int i;
+   str uri;
+   OntoUsageNode   *leaf;
+
+   if (hierarchyCount == 0) {
+   // found position in tree
+// tree-numOccurances += numTuples; // TODO cs.support not yet 
available
+   tree-numOccurances += 1;
+   return;
+   }
+
+   // search through children
+   uri  = hierarchy[hierarchyCount - 1];
+   hierarchyCount--;
+   for (i = 0; i  tree-numChildren; ++i) {
+   if (strcmp(tree-lstChildren[i]-uri, uri) == 0) {
+   // found
+   addToOntoUsageTree(tree-lstChildren[i], hierarchy, 
hierarchyCount, numTuples);
+   return;
+   }
+   }
+
+   // child not found
+   // create leaf
+   leaf = (OntoUsageNode *) malloc(sizeof(OntoUsageNode));
+   if (!leaf)
+   fprintf(stderr, ERROR: Couldn't malloc memory!\n);
+   leaf-parent = tree;
+   leaf-uri = (str) malloc(sizeof(char) * (strlen(uri) + 1));
+   if (!leaf-uri)
+   fprintf(stderr, ERROR: Couldn't malloc memory!\n);
+   strcpy(leaf-uri, uri);
+   leaf-lstChildren = NULL;
+   leaf-numChildren = 0;
+   leaf-numOccurances = 0;
+   leaf-numOccurancesSum = 0;
+   leaf-percentage = 0.0;
+   // add to tree
+   tree-numChildren++;
+   tree-lstChildren = realloc(tree-lstChildren, sizeof(OntoUsageNode *) 
* tree-numChildren);
+   if (!tree-lstChildren)
+   fprintf(stderr, ERROR: Couldn't realloc memory!\n);
+   tree-lstChildren[tree-numChildren - 1] = leaf;
+   // call
+   addToOntoUsageTree(leaf, hierarchy, hierarchyCount, numTuples);
+}
+
+
+static
+void printTree(OntoUsageNode* tree, int level) {
+   int i;
+   printf(Level %d URI %s Count %d Sum %d Percent %.1f\n, level, 
tree-uri, tree-numOccurances, tree-numOccurancesSum, tree-percentage * 100);
+   for (i = 0; i  tree-numChildren; ++i) {
+   printTree(tree-lstChildren[i], level+1);
+   }
+}
+
+static
+void createOntoUsageTree(OntoUsageNode** tree, CSset* freqCSset, str** 
ontmetadata, int ontmetadataCount, str** result, int* resultCount, int 
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount) {
+   int i, j, k;
+   str *tmpList;
+   int tmpListCount;
+   int numTuples = 0;
+
+   // init tree with an artifical root node
+

MonetDB: rdf - First draft of createFinalLabels, including new U...

2013-09-10 Thread Linnea Passing

Changeset: 8c25b051ed3a for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=8c25b051ed3a
Modified Files:
monetdb5/extras/rdf/rdf.h
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdflabels.h
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

First draft of createFinalLabels, including new UML diagram generation


diffs (truncated from 973 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h
--- a/monetdb5/extras/rdf/rdf.h
+++ b/monetdb5/extras/rdf/rdf.h
@@ -103,4 +103,13 @@ typedef enum {
 
 #define N_GRAPH_BAT (MAP_LEX+1)
 
+// Final data structure that stores the labels for tables and attributes
+typedef struct CSlabel {
+   str name;   // table name
+   str *hierarchy; // hierarchy bottom to top
+   int hierarchyCount; // number of entries in the hierarchy 
list
+   int numProp;// number of properties, copied from 
freqCSset-items[x].numProp
+   char**lstProp;  // attribute names (same order as in 
freqCSset-items[x].lstProp)
+} CSlabel;
+
 #endif /* _RDF_H_ */
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -300,6 +300,73 @@ Relation*** initRelationMetadata(int** r
return relationMetadata;
 }
 
+/* Calculate frequency per foreign key relationship. */
+static
+Relation*** initRelationMetadata2(int** relationMetadataCount, CSmergeRel* 
csRelBetweenMergeFreqSet, CSset* freqCSset) {
+   int i, j, k;
+   Relation*** relationMetadata;
+
+   int ret;
+   char*   schema = rdf;
+
+   TKNZRopen (NULL, schema);
+
+   relationMetadata = (Relation ***) malloc(sizeof(Relation **) * 
freqCSset-numCSadded);
+   if (!relationMetadata) fprintf(stderr, ERROR: Couldn't malloc 
memory!\n);
+   for (i = 0; i  freqCSset-numCSadded; ++i) { // CS
+   CS cs;
+   if (i == -1) continue; // ignore
+   cs = (CS) freqCSset-items[i];
+   relationMetadata[i] = (Relation **) malloc (sizeof(Relation *) 
* cs.numProp);
+   if (!relationMetadata[i]) fprintf(stderr, ERROR: Couldn't 
malloc memory!\n);
+   for (j = 0; j  cs.numProp; ++j) { // propNo in CS order
+   int sum = 0;
+   relationMetadataCount[i][j] = 0;
+   relationMetadata[i][j] = NULL;
+   for (k = 0; k  csRelBetweenMergeFreqSet[i].numRef; 
++k) { // propNo in CSrel
+
+   if (csRelBetweenMergeFreqSet[i].lstPropId[k] == 
cs.lstProp[j]) {
+   int toId = 
csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k];
+   if (toId == -1) continue; // ignore
+   relationMetadataCount[i][j] += 1;
+
+   // alloc/realloc
+   if (relationMetadataCount[i][j] == 1) {
+   // alloc
+   relationMetadata[i][j] = 
(Relation *) malloc (sizeof(Relation));
+   if (!relationMetadata[i][j]) 
fprintf(stderr, ERROR: Couldn't malloc memory!\n);
+   relationMetadata[i][j][0].to = 
toId;
+   relationMetadata[i][j][0].from 
= i;
+   relationMetadata[i][j][0].freq 
= csRelBetweenMergeFreqSet[i].lstCnt[k];
+   
relationMetadata[i][j][0].percent = -1;
+   } else {
+   // realloc
+   relationMetadata[i][j] = 
(Relation *) realloc(relationMetadata[i][j], sizeof(Relation) * 
relationMetadataCount[i][j]);
+   if (!relationMetadata[i][j]) 
fprintf(stderr, ERROR: Couldn't realloc memory!\n);
+   
relationMetadata[i][j][relationMetadataCount[i][j] - 1].to = toId;
+   
relationMetadata[i][j][relationMetadataCount[i][j] - 1].from = i;
+   
relationMetadata[i][j][relationMetadataCount[i][j] - 1].freq = 
csRelBetweenMergeFreqSet[i].lstCnt[k];
+   
relationMetadata[i][j][relationMetadataCount[i][j] - 1].percent = -1;
+   }
+   }
+   }
+
+   // get total count of values
+   for (k = 0; k

MonetDB: rdf - fix memory leak in URI tokenization

2013-09-09 Thread Linnea Passing

Changeset: dacb05d87466 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=dacb05d87466
Modified Files:
monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:

fix memory leak in URI tokenization


diffs (45 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -107,7 +107,7 @@ ontology ontologies[] = {
 static
 void getPropNameShort(char** name, char* propStr) {
char*token;
-   char*uri;
+   char*uri, *uriPtr;
int length = 0; // number of tokens
char**tokenizedUri = NULL;  // list of tokens
int i, j;
@@ -117,6 +117,7 @@ void getPropNameShort(char** name, char*
uri = (char *) malloc(sizeof(char) * (strlen(propStr) + 1));
if (!uri) fprintf(stderr, ERROR: Couldn't malloc memory!\n);
strcpy(uri, propStr); // uri will be modified during tokenization
+   uriPtr = uri; // uri will be modified, uriPtr keeps original pointer
token = strtok(uri, /#);
while (token != NULL) {
tokenizedUri = realloc(tokenizedUri, sizeof(char*) * ++length);
@@ -124,6 +125,7 @@ void getPropNameShort(char** name, char*
tokenizedUri[length - 1] = token;
token = strtok(NULL, /#);
}
+   free(uriPtr);
 
// match with ontologies
for (j = 0; j  ontologyCount; ++j) {
@@ -159,7 +161,7 @@ void getPropNameShort(char** name, char*
 
// no matching ontology found, return content of last token
 
-   if (length == 1) {
+   if (length = 1) {
// value
(*name) = (char *) malloc(sizeof(char) * (strlen(propStr) + 1));
if (!(*name)) fprintf(stderr, ERROR: Couldn't malloc 
memory!\n);
@@ -171,7 +173,6 @@ void getPropNameShort(char** name, char*
}
 
free(tokenizedUri);
-   free(uri);
return;
 }
 #endif
___
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - Fix check (csId == -1) when using csIdFreqIdxMap

2013-09-09 Thread Linnea Passing

Changeset: 66f9493e49b6 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=66f9493e49b6
Modified Files:
monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:

Fix check (csId == -1) when using csIdFreqIdxMap


diffs (27 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -246,9 +246,10 @@ Relation*** initRelationMetadata(int** r
relationMetadata = (Relation ***) malloc(sizeof(Relation **) * 
freqCSset-numCSadded);
if (!relationMetadata) fprintf(stderr, ERROR: Couldn't malloc 
memory!\n);
for (i = 0; i  num; ++i) { // CS
+   CS cs;
int csId = csIdFreqIdxMap[i];
-   CS cs = (CS) freqCSset-items[csId];
if (csId == -1) continue; // ignore
+   cs = (CS) freqCSset-items[csId];
relationMetadata[csId] = (Relation **) malloc (sizeof(Relation 
*) * cs.numProp);
if (!relationMetadata[csId]) fprintf(stderr, ERROR: Couldn't 
malloc memory!\n);
for (j = 0; j  cs.numProp; ++j) { // propNo in CS order
@@ -491,9 +492,10 @@ void createSQLMetadata(CSset* freqCSset,
 
// set values
for (i = 0; i  num; ++i) {
+   CS cs;
int csId = csIdFreqIdxMap[i];
-   CS cs = (CS) freqCSset-items[csId];
if (csId == -1) continue; // ignore
+   cs = (CS) freqCSset-items[csId];
 
for (j = 0; j  cs.numProp; ++j) { // propNo in CS order
// check foreign key frequency
___
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - GDKfree strings that are allocated by takeOid()

2013-09-04 Thread Linnea Passing

Changeset: 008947889f2f for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=008947889f2f
Modified Files:
monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:

GDKfree strings that are allocated by takeOid()


diffs (283 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -312,27 +312,45 @@ IncidentFKs* initLinks(int csCount) {
return links;
 }
 
-/* Modifies the parameter! */
-/* from:   URI/ or URI   to:   URI */
+/* from:   <URI>/ or <URI/> or <URI> or URI/   to:   URI */
 static
-void removeBrackets(char** s) {
-   if (strlen(*s)  2) return;
+str removeBrackets(char* s) {
+   str retStr;
 
-   if ((*s)[0] == ''  (*s)[strlen(*s) - 2] == ''  (*s)[strlen(*s) - 
1] == '/') {
+   if (s[0] == ''  s[strlen(s) - 2] == ''  s[strlen(s) - 1] == '/') {
// case <URI>/
-   (*s)[strlen(*s) - 2] = '\0';
-   (*s) += 1;
-   } else if ((*s)[0] == ''  (*s)[strlen(*s) - 2] == '/'  
(*s)[strlen(*s) - 1] == '') {
+   retStr = (str) GDKmalloc(strlen(s) - 2);
+   if (!retStr) fprintf(stderr, ERROR: Couldn't malloc 
memory!\n);
+   strncpy(retStr, s + 1, strlen(s) - 3);
+   retStr[strlen(s) - 3] = '\0';
+   return retStr;
+   } else if (s[0] == ''  s[strlen(s) - 2] == '/'  s[strlen(s) - 1] 
== '') {
// case <URI/>
-   (*s)[strlen(*s) - 2] = '\0';
-   (*s) += 1;
-   } else if ((*s)[0] == ''  (*s)[strlen(*s) - 1] == '') {
+   retStr = (str) GDKmalloc(strlen(s) - 2);
+   if (!retStr) fprintf(stderr, ERROR: Couldn't malloc 
memory!\n);
+   strncpy(retStr, s + 1, strlen(s) - 3);
+   retStr[strlen(s) - 3] = '\0';
+   return retStr;
+   } else if (s[0] == ''  s[strlen(s) - 1] == '') {
// case <URI>
-   (*s)[strlen(*s) - 1] = '\0';
-   (*s) += 1;
-   } else if ((*s)[strlen(*s) - 1] == '/') {
+   retStr = (str) GDKmalloc(strlen(s) - 1);
+   if (!retStr) fprintf(stderr, ERROR: Couldn't malloc 
memory!\n);
+   strncpy(retStr, s + 1, strlen(s) - 2);
+   retStr[strlen(s) - 2] = '\0';
+   return retStr;
+   } else if (s[strlen(s) - 1] == '/') {
// case URI/
-   (*s)[strlen(*s) - 1] = '\0';
+   retStr = (str) GDKmalloc(strlen(s));
+   if (!retStr) fprintf(stderr, ERROR: Couldn't malloc 
memory!\n);
+   strncpy(retStr, s + 1, strlen(s) - 1);
+   retStr[strlen(s) - 1] = '\0';
+   return retStr;
+   } else {
+   // copy
+   retStr = (str) GDKmalloc(strlen(s) + 1);
+   if (!retStr) fprintf(stderr, ERROR: Couldn't malloc 
memory!\n);
+   strcpy(retStr, s);
+   return retStr;
}
 }
 
@@ -567,8 +585,7 @@ void createTypeAttributesHistogram(BAT *
BUN p, q;
oid *sbt, *obt, *pbt;
charobjType;
-   str propStr, objStr;
-   char*objStrPtr;
+   str propStr, objStr, tmpStr;
 
char*start, *end;
int length;
@@ -620,9 +637,8 @@ void createTypeAttributesHistogram(BAT *
 
if (objType == URI || objType == BLANKNODE) {
objOid = objOid - ((oid)objType  
(sizeof(BUN)*8 - 4));
-   takeOid(objOid, objStr);
-   removeBrackets(objStr);
-   objStrPtr = objStr;
+   takeOid(objOid, tmpStr);
+   objStr = removeBrackets(tmpStr);
} else {
objOid = objOid - (objType*2 + 1) *  
RDF_MIN_LITERAL;   /* Get the real objOid from Map or Tokenizer */
bun = BUNfirst(mapbat);
@@ -633,19 +649,15 @@ void createTypeAttributesHistogram(BAT *
end = strrchr(objStr, '');
if (start != NULL  end != NULL) {
length = end - start;
-   objStrPtr = (char *) 
malloc(sizeof(char) * (length + 1));
-   if (!objStrPtr) fprintf(stderr, 
ERROR: Couldn't malloc memory!\n);
-   memcpy(objStrPtr, start, 
length);
-   objStrPtr[length] = '\0';
-   } else {
-   objStrPtr = objStr;
+

MonetDB: rdf - Fix segfault on freeing lstObj[]

2013-09-04 Thread Linnea Passing

Changeset: 7eb1425edd90 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=7eb1425edd90
Modified Files:
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

Fix segfault on freeing lstObj[]


diffs (23 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -987,11 +987,16 @@ void freeCSset(CSset *csSet){
int i;
for(i = 0; i  csSet-numCSadded; i ++){
free(csSet-items[i].lstProp);
-   #if STOREFULLCS
+
+   }
+
+   #if STOREFULLCS
+   for(i = 0; i  csSet-numOrigFreqCS; i ++){
free(csSet-items[i].lstObj);
-   #endif
-
}
+   #endif
+
+
free(csSet-items);
free(csSet);
 }
___
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - create labels for freqCS, not maxCS/mergeCS

2013-08-12 Thread Linnea Passing

Changeset: 72b6716bcfd7 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=72b6716bcfd7
Modified Files:
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdflabels.h
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

create labels for freqCS, not maxCS/mergeCS


diffs (truncated from 394 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -221,7 +221,6 @@ int** initRelationMetadataCount(CSset* f
if (!relationMetadataCount) fprintf(stderr, ERROR: Couldn't malloc 
memory!\n);
for (i = 0; i  freqCSset-numCSadded; ++i) {
relationMetadataCount[i] = NULL;
-   if (freqCSset-items[i].parentFreqIdx != -1) continue; // ignore
relationMetadataCount[i] = (int *) malloc(sizeof(int) * 
freqCSset-items[i].numProp);
if (!relationMetadataCount[i]) fprintf(stderr, ERROR: Couldn't 
malloc memory!\n);
for (j = 0; j  freqCSset-items[i].numProp; ++j) {
@@ -234,7 +233,7 @@ int** initRelationMetadataCount(CSset* f
 
 /* Calculate frequency per foreign key relationship. */
 static
-Relation*** initRelationMetadata(int** relationMetadataCount, CSmergeRel* 
csRelBetweenMergeFreqSet, CSset* freqCSset) {
+Relation*** initRelationMetadata(int** relationMetadataCount, CSrel* csrelSet, 
int num, CSset* freqCSset, int* csIdFreqIdxMap) {
int i, j, k;
Relation*** relationMetadata;
 
@@ -245,49 +244,51 @@ Relation*** initRelationMetadata(int** r
 
relationMetadata = (Relation ***) malloc(sizeof(Relation **) * 
freqCSset-numCSadded);
if (!relationMetadata) fprintf(stderr, ERROR: Couldn't malloc 
memory!\n);
-   for (i = 0; i  freqCSset-numCSadded; ++i) { // CS
-   CS cs = (CS) freqCSset-items[i];
-   if (cs.parentFreqIdx != -1) continue; // ignore
-   relationMetadata[i] = (Relation **) malloc (sizeof(Relation *) 
* cs.numProp);
-   if (!relationMetadata[i]) fprintf(stderr, ERROR: Couldn't 
malloc memory!\n);
+   for (i = 0; i  num; ++i) { // CS
+   int csId = csIdFreqIdxMap[i];
+   CS cs = (CS) freqCSset-items[csId];
+   if (csId == -1) continue; // ignore
+   relationMetadata[csId] = (Relation **) malloc (sizeof(Relation 
*) * cs.numProp);
+   if (!relationMetadata[csId]) fprintf(stderr, ERROR: Couldn't 
malloc memory!\n);
for (j = 0; j  cs.numProp; ++j) { // propNo in CS order
int sum = 0;
-   relationMetadataCount[i][j] = 0;
-   relationMetadata[i][j] = NULL;
-   for (k = 0; k  csRelBetweenMergeFreqSet[i].numRef; 
++k) { // propNo in CSrel
+   relationMetadataCount[csId][j] = 0;
+   relationMetadata[csId][j] = NULL;
+   for (k = 0; k  csrelSet[i].numRef; ++k) { // propNo in 
CSrel
 
-   if (csRelBetweenMergeFreqSet[i].lstPropId[k] == 
cs.lstProp[j]) {
-   int toId = 
csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k];
-   relationMetadataCount[i][j] += 1;
+   if (csrelSet[i].lstPropId[k] == cs.lstProp[j]) {
+   int toId = csIdFreqIdxMap[ 
csrelSet[i].lstRefCSoid[k] ];
+   if (toId == -1) continue; // ignore
+   relationMetadataCount[csId][j] += 1;
 
// alloc/realloc
-   if (relationMetadataCount[i][j] == 1) {
+   if (relationMetadataCount[csId][j] == 
1) {
// alloc
-   relationMetadata[i][j] = 
(Relation *) malloc (sizeof(Relation));
-   if (!relationMetadata[i][j]) 
fprintf(stderr, ERROR: Couldn't malloc memory!\n);
-   relationMetadata[i][j][0].to = 
toId;
-   relationMetadata[i][j][0].from 
= i;
-   relationMetadata[i][j][0].freq 
= csRelBetweenMergeFreqSet[i].lstCnt[k];
-   
relationMetadata[i][j][0].percent = -1;
+   relationMetadata[csId][j] = 
(Relation *) malloc (sizeof(Relation));
+   if (!relationMetadata[csId][j]) 
fprintf(stderr, ERROR: Couldn't malloc memory!\n);
+   relationMetadata[csId][j][0].to 
= toId;
+

MonetDB: rdf - Add directory for ontology metadata

2013-08-12 Thread Linnea Passing

Changeset: c8bf33c699c9 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c8bf33c699c9
Added Files:
monetdb5/extras/rdf/ontmetadata/loadOntologySAMPLE.sql
monetdb5/extras/rdf/ontmetadata/loadOntologyToMonetDB.sh
monetdb5/extras/rdf/ontmetadata/ontAttribute.dbpedia.csv
monetdb5/extras/rdf/ontmetadata/ontAttribute.dbpedia351.csv
monetdb5/extras/rdf/ontmetadata/ontAttribute.gr.csv
monetdb5/extras/rdf/ontmetadata/ontMetadata.dbpedia.csv
monetdb5/extras/rdf/ontmetadata/ontMetadata.dbpedia351.csv
monetdb5/extras/rdf/ontmetadata/ontMetadata.gr.csv
Branch: rdf
Log Message:

Add directory for ontology metadata


diffs (truncated from 35690 to 300 lines):

diff --git a/monetdb5/extras/rdf/ontmetadata/loadOntologySAMPLE.sql 
b/monetdb5/extras/rdf/ontmetadata/loadOntologySAMPLE.sql
new file mode 100644
--- /dev/null
+++ b/monetdb5/extras/rdf/ontmetadata/loadOntologySAMPLE.sql
@@ -0,0 +1,2 @@
+COPY NUMMETADATA RECORDS INTO ontmetadata  FROM 
'/export/scratch2/linnea/scripts/loadOntology/ontMetadata.csv' USING 
DELIMITERS '|', '\n';
+COPY NUMATTRIBUTES RECORDS INTO ontattributes FROM 
'/export/scratch2/linnea/scripts/loadOntology/ontAttribute.csv' USING 
DELIMITERS '|', '\n';
diff --git a/monetdb5/extras/rdf/ontmetadata/loadOntologyToMonetDB.sh 
b/monetdb5/extras/rdf/ontmetadata/loadOntologyToMonetDB.sh
new file mode 100755
--- /dev/null
+++ b/monetdb5/extras/rdf/ontmetadata/loadOntologyToMonetDB.sh
@@ -0,0 +1,19 @@
+NUMMETADATA=`cat ontMetadata.dbpedia.csv | wc -l`
+NUMATTRIBUTES=`cat ontAttribute.dbpedia.csv | wc -l`
+
+cp loadOntologySAMPLE.sql loadtmp.sql
+sed -i s:NUMMETADATA:$NUMMETADATA:g loadtmp.sql
+sed -i s:NUMATTRIBUTES:$NUMATTRIBUTES:g loadtmp.sql
+
+mclient -d dbpedia --port=5  loadtmp.sql
+
+
+
+NUMMETADATA=`cat ontMetadata.gr.csv | wc -l`
+NUMATTRIBUTES=`cat ontAttribute.gr.csv | wc -l`
+
+cp loadOntologySAMPLE.sql loadtmp.sql
+sed -i s:NUMMETADATA:$NUMMETADATA:g loadtmp.sql
+sed -i s:NUMATTRIBUTES:$NUMATTRIBUTES:g loadtmp.sql
+
+mclient -d dbpedia --port=5  loadtmp.sql
diff --git a/monetdb5/extras/rdf/ontmetadata/ontAttribute.dbpedia.csv 
b/monetdb5/extras/rdf/ontmetadata/ontAttribute.dbpedia.csv
new file mode 100644
--- /dev/null
+++ b/monetdb5/extras/rdf/ontmetadata/ontAttribute.dbpedia.csv
@@ -0,0 +1,15861 @@
+http://dbpedia.org/ontology/AcademicJournal|http://dbpedia.org/ontology/academicDiscipline
+http://dbpedia.org/ontology/AcademicJournal|http://dbpedia.org/ontology/impactFactor
+http://dbpedia.org/ontology/AcademicJournal|http://dbpedia.org/ontology/impactFactorAsOf
+http://dbpedia.org/ontology/AcademicJournal|http://dbpedia.org/ontology/isPeerReviewed
+http://dbpedia.org/ontology/AcademicJournal|http://dbpedia.org/ontology/jstor
+http://dbpedia.org/ontology/Game|http://dbpedia.org/ontology/equipment
+http://dbpedia.org/ontology/Activity|http://dbpedia.org/ontology/equipment
+http://dbpedia.org/ontology/Sport|http://dbpedia.org/ontology/equipment
+http://dbpedia.org/ontology/AdultActor|http://dbpedia.org/ontology/arielAward
+http://dbpedia.org/ontology/VoiceActor|http://dbpedia.org/ontology/arielAward
+http://dbpedia.org/ontology/Actor|http://dbpedia.org/ontology/arielAward
+http://dbpedia.org/ontology/VoiceActor|http://dbpedia.org/ontology/geminiAward
+http://dbpedia.org/ontology/AdultActor|http://dbpedia.org/ontology/geminiAward
+http://dbpedia.org/ontology/Actor|http://dbpedia.org/ontology/geminiAward
+http://dbpedia.org/ontology/VoiceActor|http://dbpedia.org/ontology/goldenCalfAward
+http://dbpedia.org/ontology/AdultActor|http://dbpedia.org/ontology/goldenCalfAward
+http://dbpedia.org/ontology/Actor|http://dbpedia.org/ontology/goldenCalfAward
+http://dbpedia.org/ontology/Actor|http://dbpedia.org/ontology/goldenRaspberryAward
+http://dbpedia.org/ontology/VoiceActor|http://dbpedia.org/ontology/goldenRaspberryAward
+http://dbpedia.org/ontology/AdultActor|http://dbpedia.org/ontology/goldenRaspberryAward
+http://dbpedia.org/ontology/Actor|http://dbpedia.org/ontology/iftaAward
+http://dbpedia.org/ontology/VoiceActor|http://dbpedia.org/ontology/iftaAward
+http://dbpedia.org/ontology/AdultActor|http://dbpedia.org/ontology/iftaAward
+http://dbpedia.org/ontology/VoiceActor|http://dbpedia.org/ontology/laurenceOlivierAward
+http://dbpedia.org/ontology/Actor|http://dbpedia.org/ontology/laurenceOlivierAward
+http://dbpedia.org/ontology/AdultActor|http://dbpedia.org/ontology/laurenceOlivierAward
+http://dbpedia.org/ontology/Actor|http://dbpedia.org/ontology/naacpImageAward
+http://dbpedia.org/ontology/AdultActor|http://dbpedia.org/ontology/naacpImageAward
+http://dbpedia.org/ontology/VoiceActor|http://dbpedia.org/ontology/naacpImageAward
+http://dbpedia.org/ontology/Actor|http://dbpedia.org/ontology/nationalFilmAward
+http://dbpedia.org/ontology/AdultActor|http://dbpedia.org/ontology/nationalFilmAward
+http://dbpedia.org/ontology/VoiceActor|http://dbpedia.org/ontology/nationalFilmAward

MonetDB: rdf - Store explicit metadata (tables and relationships)

2013-07-30 Thread Linnea Passing

Changeset: a536099d8d69 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=a536099d8d69
Modified Files:
monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:

Store explicit metadata (tables and relationships)
Two tables are created: one stores the relationships between tables, 
the other stores the number of tuples per table


diffs (117 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -322,13 +322,14 @@ void escapeURI(char* s) {
 }
 
 /* Modifies the parameter! */
-/* Replaces colons, quotes, spaces, and dashes with underscores. */
+/* Replaces colons, quotes, spaces, and dashes with underscores. All 
lowercase. */
 static
 void escapeURIforSQL(char* s) {
int i;
 
for (i = 0; i < (int) strlen(s); ++i) {
if (s[i] == ':' || s[i] == '"' || s[i] == ' ' || s[i] == '-') 
s[i] = '_';
+   s[i] = tolower(s[i]);
}
 }
 
@@ -364,7 +365,7 @@ void convertToSQL(CSset *freqCSset, Rela
if ( freqCSset-items[i].parentFreqIdx != -1) continue; // 
ignore
strcpy(temp, labels[i].name);
escapeURIforSQL(temp);
-   fprintf(fout, CREATE TABLE %s_BUNFMT (\nsubject VARCHAR(10) 
PRIMARY KEY,\n, temp, freqCSset-items[i].csId); // TODO uppercase? 
underscores?
+   fprintf(fout, CREATE TABLE %s_BUNFMT (\nsubject VARCHAR(10) 
PRIMARY KEY,\n, temp, freqCSset-items[i].csId); // TODO underscores?
for (j = 0; j  labels[i].numProp; ++j) {
char temp2[100];
strcpy(temp2, labels[i].lstProp[j]);
@@ -411,6 +412,80 @@ void convertToSQL(CSset *freqCSset, Rela
TKNZRclose(ret);
 }
 
+static
+void createSQLMetadata(CSset* freqCSset, CSmergeRel* csRelBetweenMergeFreqSet, 
Labels* labels) {
+   char**matrix = NULL; // matrix[from][to]
+   int i, j, k;
+   FILE*fout;
+
+   // init
+   matrix = (char **) malloc(sizeof(char *) * freqCSset->numCSadded);
+   if (!matrix) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+
+   for (i = 0; i < freqCSset->numCSadded; ++i) {
+   matrix[i] = (char *) malloc(sizeof(char *) * 
freqCSset->numCSadded);
+   if (!matrix) fprintf(stderr, "ERROR: Couldn't realloc 
memory!\n");
+
+   for (j = 0; j  freqCSset-numCSadded; ++j) {
+   matrix[i][j] = 0;
+   }
+   }
+
+   // set values
+   for (i = 0; i  freqCSset-numCSadded; ++i) {
+   if (freqCSset-items[i].parentFreqIdx != -1) continue; // ignore
+
+   for (j = 0; j  freqCSset-items[i].numProp; ++j) { // propNo 
in CS order
+   // check foreign key frequency
+   int sum = 0;
+   for (k = 0; k  csRelBetweenMergeFreqSet[i].numRef; 
++k) {
+   if (csRelBetweenMergeFreqSet[i].lstPropId[k] == 
freqCSset-items[i].lstProp[j]) {
+   sum += 
csRelBetweenMergeFreqSet[i].lstCnt[k];
+   }
+   }
+
+   for (k = 0; k  csRelBetweenMergeFreqSet[i].numRef; 
++k) { // propNo in CSrel
+   if (csRelBetweenMergeFreqSet[i].lstPropId[k] == 
freqCSset-items[i].lstProp[j]) {
+   int to = 
csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k];
+   if (i == to) continue; // ignore self 
references
+   if ((int) (100.0 * 
csRelBetweenMergeFreqSet[i].lstCnt[k] / sum + 0.5)  FK_FREQ_THRESHOLD) 
continue; // foreign key is not frequent enough
+   matrix[i][to] = 1;
+   }
+   }
+   }
+   }
+
+   // store matrix as csv
+   fout = fopen("adjacencyList.csv", "wt");
+   for (i = 0; i < freqCSset->numCSadded; ++i) {
+   for (j = 0; j < freqCSset->numCSadded; ++j) {
+   if (matrix[i][j]) {
+   fprintf(fout, "\"%d\",\"%d\"\n",i,j);
+   }
+   }
+   }
+   fclose(fout);
+
+   // print id - table name
+   fout = fopen(tableIdFreq.csv, wt);
+   for (i = 0; i  freqCSset-numCSadded; ++i) {
+   char temp[100], temp2[100];
+   if (freqCSset-items[i].parentFreqIdx != -1) continue; // ignore
+   strcpy(temp, labels[i].name);
+   escapeURIforSQL(temp);
+   sprintf(temp2, %s_BUNFMT, temp, freqCSset-items[i].csId); 
// TODO underscores?
+   fprintf(fout, \%d\,\%s\,\%d\\n, i, temp2, 
freqCSset-items[i].support);
+   }
+   fclose(fout);
+
+   fout = fopen(CSmetadata.sql, wt);
+   fprintf(fout, CREATE TABLE table_id_freq (id VARCHAR(10), name 
VARCHAR(100), frequency

MonetDB: rdf - Improve memory footprint of labeling algorithm

2013-07-30 Thread Linnea Passing

Changeset: 2242dea64568 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=2242dea64568
Modified Files:
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdflabels.h
Branch: rdf
Log Message:

Improve memory footprint of labeling algorithm


diffs (truncated from 776 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -105,15 +105,17 @@ ontology ontologies[] = {
 #if USE_SHORT_NAMES
 /* Extracts the human-readable part of an URI (usually the last token). */
 static
-void getPropNameShort(char* name, char* propStr) {
+void getPropNameShort(char** name, char* propStr) {
char*token;
-   charuri[1000];
+   char*uri;
int length = 0; // number of tokens
char**tokenizedUri = NULL;  // list of tokens
int i, j;
int fit;
 
// tokenize uri
+   uri = (char *) malloc(sizeof(char) * (strlen(propStr) + 1));
+   if (!uri) fprintf(stderr, ERROR: Couldn't malloc memory!\n);
strcpy(uri, propStr); // uri will be modified during tokenization
token = strtok(uri, "/#");
while (token != NULL) {
@@ -134,12 +136,20 @@ void getPropNameShort(char* name, char* 
}
if (fit) {
// found matching ontology, create label
+   int totalLength = 0;
for (i = ontologies[j].length; i  length; ++i) 
{
-   strcat(name, tokenizedUri[i]);
-   strcat(name, _); // if label consists 
of =2 tokens, use underscores
+   totalLength += (strlen(tokenizedUri[i]) 
+ 1); // additional char for underscore
+   }
+   (*name) = (char *) malloc(sizeof(char) * 
(totalLength + 1));
+   if (!(*name)) fprintf(stderr, ERROR: Couldn't 
malloc memory!\n);
+   strcpy(*name, "\0");
+
+   for (i = ontologies[j].length; i  length; ++i) 
{
+   strcat(*name, tokenizedUri[i]);
+   strcat(*name, _); // if label 
consists of =2 tokens, use underscores
}
// remove trailing underscore
-   name[strlen(name) - 1] = '\0';
+   (*name)[strlen(*name) - 1] = '\0';
 
free(tokenizedUri);
return;
@@ -151,12 +161,17 @@ void getPropNameShort(char* name, char* 
 
if (length == 1) {
// value
-   strcat(name, propStr);
+   (*name) = (char *) malloc(sizeof(char) * (strlen(propStr) + 1));
+   if (!(*name)) fprintf(stderr, ERROR: Couldn't malloc 
memory!\n);
+   strcpy(*name, propStr);
} else {
-   strcat(name, tokenizedUri[length - 1]);
+   (*name) = (char *) malloc(sizeof(char) * 
(strlen(tokenizedUri[length - 1]) + 1));
+   if (!(*name)) fprintf(stderr, ERROR: Couldn't malloc 
memory!\n);
+   strcpy(*name, tokenizedUri[length - 1]);
}
 
free(tokenizedUri);
+   free(uri);
return;
 }
 #endif
@@ -180,8 +195,8 @@ int** initTypeAttributesHistogramCount(i
 }
 
 static
-TypeAttributesFreq*** initTypeAttributesHistogram(int typeAttributesCount, int 
** typeAttributesHistogramCount, int num) {
-   int i, j, k;
+TypeAttributesFreq*** initTypeAttributesHistogram(int typeAttributesCount, int 
num) {
+   int i, j;
TypeAttributesFreq***   typeAttributesHistogram;
 
typeAttributesHistogram = (TypeAttributesFreq ***) 
malloc(sizeof(TypeAttributesFreq **) * num);
@@ -190,12 +205,7 @@ TypeAttributesFreq*** initTypeAttributes
typeAttributesHistogram[i] = (TypeAttributesFreq **) malloc 
(sizeof(TypeAttributesFreq *) * typeAttributesCount);
if (!typeAttributesHistogram[i]) fprintf(stderr, ERROR: 
Couldn't malloc memory!\n);
for (j = 0; j  typeAttributesCount; ++j) {
-   typeAttributesHistogram[i][j] = (TypeAttributesFreq *) 
malloc (sizeof(TypeAttributesFreq) * typeAttributesHistogramCount[i][j]);
-   if (!typeAttributesHistogram[i][j]) fprintf(stderr, 
ERROR: Couldn't malloc memory!\n);
-   for (k = 0; k  typeAttributesHistogramCount[i][j]; 
++k) {
-   typeAttributesHistogram[i][j][k].freq = 0;
-   typeAttributesHistogram[i][j][k].percent = 0;
-   }
+

MonetDB: rdf - SQL procedure to create a subschema

2013-07-30 Thread Linnea Passing

Changeset: cbde82c8ce68 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=cbde82c8ce68
Modified Files:
monetdb5/extras/rdf/rdfretrieval.c
monetdb5/extras/rdf/rdfretrieval.h
monetdb5/extras/rdf/rdfschema.c
sql/backends/monet5/sql.mx
sql/scripts/30_rdf.sql
Branch: rdf
Log Message:

SQL procedure to create a subschema


diffs (truncated from 727 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdfretrieval.c 
b/monetdb5/extras/rdf/rdfretrieval.c
--- a/monetdb5/extras/rdf/rdfretrieval.c
+++ b/monetdb5/extras/rdf/rdfretrieval.c
@@ -24,65 +24,25 @@
 #include "rdflabels.h"
 
 static
-char** initAdjacencyMatrix(int csCount) {
-   char**matrix = NULL; // matrix[from][to]
-   int i, j;
-
-   matrix = (char **) malloc(sizeof(char *) * csCount);
-   if (!matrix) fprintf(stderr, ERROR: Couldn't malloc memory!\n);
-
-   for (i = 0; i  csCount; ++i) {
-   matrix[i] = (char *) malloc(sizeof(char *) * csCount);
-   if (!matrix) fprintf(stderr, ERROR: Couldn't realloc 
memory!\n);
-
-   for (j = 0; j  csCount; ++j) {
-   matrix[i][j] = 0;
-   }
+int edgeExists(long int from, long int to, long int* adjacency_from, long int* 
adjacency_to, int adjacencyCount) {
+   int i;
+   for (i = 0; i < adjacencyCount; ++i) {
+   if (adjacency_from[i] == from && adjacency_to[i] == to) return 
1;
}
-
-   return matrix;
+   return 0;
 }
 
 static
-void createAdjacencyMatrix(char** matrix, CSset* freqCSset, CSmergeRel* 
csRelBetweenMergeFreqSet) {
-   int i, j, k;
-
-   for (i = 0; i  freqCSset-numCSadded; ++i) {
-   if (freqCSset-items[i].parentFreqIdx != -1) continue; // ignore
-
-   for (j = 0; j  freqCSset-items[i].numProp; ++j) { // propNo 
in CS order
-   // check foreign key frequency
-   int sum = 0;
-   for (k = 0; k  csRelBetweenMergeFreqSet[i].numRef; 
++k) {
-   if (csRelBetweenMergeFreqSet[i].lstPropId[k] == 
freqCSset-items[i].lstProp[j]) {
-   sum += 
csRelBetweenMergeFreqSet[i].lstCnt[k];
-   }
-   }
-
-   for (k = 0; k  csRelBetweenMergeFreqSet[i].numRef; 
++k) { // propNo in CSrel
-   if (csRelBetweenMergeFreqSet[i].lstPropId[k] == 
freqCSset-items[i].lstProp[j]) {
-   int to = 
csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k];
-   if (i == to) continue; // ignore self 
references
-   if ((int) (100.0 * 
csRelBetweenMergeFreqSet[i].lstCnt[k] / sum + 0.5)  FK_FREQ_THRESHOLD) 
continue; // foreign key is not frequent enough
-   matrix[i][to] = 1;
-   }
-   }   
-   }
-   }
-}
-
-static
-NodeStat* initNodeStats(CSset* freqCSset) {
+NodeStat* initNodeStats1(long int* table_freq, int tableCount) {
NodeStat*   nodeStats = NULL;
int i;
 
-   nodeStats = (NodeStat *) malloc(sizeof(NodeStat) * 
freqCSset-numCSadded);
+   nodeStats = (NodeStat *) malloc(sizeof(NodeStat) * tableCount);
if (!nodeStats) fprintf(stderr, ERROR: Couldn't malloc memory!\n);
 
-   for (i = 0; i  freqCSset-numCSadded; ++i) {
-   if (freqCSset-items[i].parentFreqIdx != -1) continue; // ignore
-   nodeStats[i].origWeight = freqCSset-items[i].support;
-   nodeStats[i].weight = freqCSset-items[i].support; // weight = 
origWeight
+   for (i = 0; i  tableCount; ++i) {
+   nodeStats[i].origWeight = table_freq[i];
+   nodeStats[i].weight = table_freq[i]; // weight = origWeight
nodeStats[i].steps = -1;
nodeStats[i].predecessor = -1;
}
@@ -91,30 +51,11 @@ NodeStat* initNodeStats(CSset* freqCSset
 }
 
 static
-NodeStat* initNodeStats23(CSset* freqCSset) {
-   NodeStat*   nodeStats = NULL;
-   int i;
-
-   nodeStats = (NodeStat *) malloc(sizeof(NodeStat) * 
freqCSset-numCSadded);
-   if (!nodeStats) fprintf(stderr, ERROR: Couldn't malloc memory!\n);
-
-   for (i = 0; i  freqCSset-numCSadded; ++i) {
-   if (freqCSset-items[i].parentFreqIdx != -1) continue; // ignore
-   nodeStats[i].origWeight = freqCSset-items[i].support;
-   nodeStats[i].weight = 0;
-   nodeStats[i].steps = -1; // not used
-   nodeStats[i].predecessor = 0; // not used
-   }
-
-   return nodeStats;
-}
-
-static
-void bfs1(int root, CSset* freqCSset, char** adjacencyMatrix, int* queue, int* 
visited, int* isInQueue, int* queuePosition, int* queueLength, NodeStat* 
nodeStats) {
+void bfs1(int root, long

MonetDB: rdf - Schema overview: first version of algorithm to ch...

2013-07-30 Thread Linnea Passing

Changeset: 9a9a115446e0 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9a9a115446e0
Modified Files:
monetdb5/extras/rdf/rdfretrieval.c
monetdb5/extras/rdf/rdfretrieval.h
Branch: rdf
Log Message:

Schema overview: first version of algorithm to choose tables that provide an 
overview of the SQL schema


diffs (289 lines):

diff --git a/monetdb5/extras/rdf/rdfretrieval.c 
b/monetdb5/extras/rdf/rdfretrieval.c
--- a/monetdb5/extras/rdf/rdfretrieval.c
+++ b/monetdb5/extras/rdf/rdfretrieval.c
@@ -553,8 +553,257 @@ int* retrieval4(int root, int numNodesMa
return chosenNodes;
 }
 
+static
+char** initEdgesOverview(long int* table_id, int tableCount, long int* 
adjacency_from, long int* adjacency_to, int adjacencyCount) {
+   char**edges;
+   int i, j;
+
+   edges = (char **) malloc(sizeof(char *) * tableCount);
+   if (!edges) fprintf(stderr, ERROR: Couldn't malloc memory!\n);
+
+   for (i = 0; i  tableCount; ++i) {
+   edges[i] = (char *) malloc(sizeof(char) * tableCount);
+   if (!edges[i]) fprintf(stderr, ERROR: Couldn't malloc 
memory!\n);
+   for (j = 0; j  tableCount; ++j) {
+   edges[i][j] = 0;
+   }
+   edges[i][i] = 1; // self-reachability
+   }
+
+   for (i = 0; i  adjacencyCount; ++i) {
+   long int from = adjacency_from[i];
+   long int to = adjacency_to[i];
+   int fromIdx = -1;
+   int toIdx = -1;
+
+   // index lookup
+   for (j = 0; j < tableCount; ++j) {
+   if (table_id[j] == from) {fromIdx = j;}
+   if (table_id[j] == to) {toIdx = j;}
+   if (fromIdx > -1 && toIdx > -1) {break;}
+   }
+   assert(fromIdx > -1);
+   assert(toIdx > -1);
+
+   // set edge
+   edges[fromIdx][toIdx] = 1;
+   }
+
+   return edges;
+}
+
+static
+int compareOverviewNodes (const void * a, const void * b) {
+  return ( (*(Node*)b).reachabilityCount - (*(Node*)a).reachabilityCount ); // 
sort descending
+}
+
+static
+int* retrievalOverview(int* numNodesActual, long int* table_id, str* 
table_name, long int* table_freq, int tableCount, long int* adjacency_from, 
long int* adjacency_to, int adjacencyCount) {
+   int i, j, k;
+   char**edges;
+   int sumSubjects = 0;
+   int csCount = 0;
+   int sumChosenSubjects = 0;
+
+   int queue[tableCount]; // cyclic array
+   int isInQueue[tableCount];
+   int queuePosition; // next element in queue to view at
+   int queueLength;
+   char    visited[tableCount];
+   int subgraphSize;
+   Groups  groups;
+   int *chosenNodes = NULL;
+
+   groups.count = 0;
+   groups.groups = NULL;
+
+   edges = initEdgesOverview(table_id, tableCount, adjacency_from, 
adjacency_to, adjacencyCount);
+
+   for (i = 0; i  tableCount; ++i) {
+   visited[i] = 0;
+   }
+
+   // split into disconnected subgraph (ignoring the direction of the 
edges) using BFS
+   while (1) {
+   int root = -1;
+   for (i = 0; i  tableCount; ++i) {
+   if (!visited[i]) {
+   root = i;
+   break;
+   }
+   }
+   if (root == -1) break; // all nodes have been visited, all 
subgraphs have been found
+   // init
+   subgraphSize = 0;
+
+   for (i = 0; i  tableCount; ++i) {
+   queue[i] = -1;
+   isInQueue[i] = 0;
+   }
+
+   // add root node
+   queue[0] = root;
+   queuePosition = 0;
+   queueLength = 1;
+
+   visited[root] = 1;
+   isInQueue[root] = 1;
+
+   // bfs
+   while (queueLength > 0) {
+   // dequeue next value
+   int node = queue[queuePosition % tableCount];
+   visited[node] = 1;
+   subgraphSize++;
+   isInQueue[node] = 0;
+   queuePosition += 1;
+   queueLength -= 1;
+
+   // for all adjacent edges
+   for (i = 0; i  tableCount; ++i) {
+   if (visited[i] || isInQueue[i]) continue;
+   if (edges[node][i] || edges[i][node]) {
+   // ignore direction of edge
+
+   // enqueue
+   queue[((queueLength + queuePosition) % 
tableCount)] = i;
+

48 matches

Mail list logo