Changeset: 3e4ece2b7085 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3e4ece2b7085
Modified Files:
        monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:

Improve label quality

- Computation of similarity between CSs and ontology classes is now based on the 
assumption that all properties of a CS should belong to one ontology class, not 
that the CS has to consist of ALL properties of the corresponding ontology 
class.
- Type values are usually multi-valued properties; the values represent the 
hierarchy the subject belongs to (e.g., if a subject in the dbpedia dataset has 
type 'Athlete', it also has types 'Person', 'Agent', 'Thing'). This hierarchy 
is analyzed and only the most specific type value (the "leaf") is added to the 
data structures. This improves the label candidates that are computed using 
type values.


diffs (284 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -851,19 +851,89 @@ int compareTypeAttributesFreqs (const vo
 #endif
 
 #if USE_TYPE_NAMES
+/* Analyze hierarchy in a list of type values and add all leaf values to the histogram typeAttributesHistogram[csFreqIdx][type]: an existing bucket gets freq += 1, otherwise a new bucket with freq 1 is appended. 
A value loses its leaf flag when it equals the cOid of a superclass (scIdxes) of another list value that is still flagged as leaf when visited. Values that are not present in the hierarchy tree built from the 
ontologies (no BUNfnd hit in 'ontmetaBat') are NOT added to the histogram. NOTE(review): 'pos' from BUNfnd is used directly as index into 'ontclassSet' -- presumably both share positions; confirm. */
+static
+void insertLeafsIntoTypeAttributesHistogram(oid* typeList, int typeListLength, 
TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat, 
OntClass *ontclassSet) {
+       int             i, j, k;
+       int             fit; // set to 1 once an existing bucket for the value was found
+       char            *leaf; // flag whether a type value in 'typeList' is a 
leaf (1) or not (0)
+       BUN             pos; // BUNfnd result for the current type value
+       OntClass        hierarchy; // copy of the ontclassSet entry of the current type value
+
+       // start with: every type value is a leaf
+       leaf = GDKmalloc(sizeof(char) * typeListLength); // NOTE(review): result is not checked before it is written to on the next line
+       for (i = 0; i < typeListLength; ++i) leaf[i] = 1;
+
+       // analyze hierarchy
+       for (i = 0; i < typeListLength; ++i) {
+               if (!leaf[i]) continue; // values already demoted are not used to demote others
+               pos = BUNfnd(BATmirror(ontmetaBat), &typeList[i]);
+               if (pos == BUN_NONE) {
+                       // no ontology information for this type value, 
therefore it is not added to the hierarchy
+                       leaf[i] = 0;
+                       continue;
+               }
+
+               // get hierarchy of this type value
+               hierarchy = ontclassSet[pos];
+
+               // loop over superclasses, set leaf=0
+               for (j = 0; j < hierarchy.numsc; ++j) {
+                       for (k = 0; k < typeListLength; ++k) {
+                               if (i == k) continue; // a value never demotes itself
+                               if (ontclassSet[hierarchy.scIdxes[j]].cOid == 
typeList[k]) {
+                                       // found superclass at position 'k'
+                                       leaf[k] = 0;
+                               }
+                       }
+               }
+       }
+
+       // add all leafs to the histogram
+       for (i = 0; i < typeListLength; ++i) {
+               if (!leaf[i]) continue;
+               fit = 0;
+               for (j = 0; j < typeAttributesHistogramCount[csFreqIdx][type]; 
++j) {
+                       if (typeAttributesHistogram[csFreqIdx][type][j].value 
== typeList[i]) {
+                               // bucket exists
+                               
typeAttributesHistogram[csFreqIdx][type][j].freq += 1;
+                               fit = 1;
+                               break;
+                       }
+               }
+               if (!fit) {
+                       // bucket does not exist
+                       // realloc
+                       typeAttributesHistogramCount[csFreqIdx][type] += 1;
+                       typeAttributesHistogram[csFreqIdx][type] = 
(TypeAttributesFreq *) realloc(typeAttributesHistogram[csFreqIdx][type], 
sizeof(TypeAttributesFreq) * typeAttributesHistogramCount[csFreqIdx][type]);
+                       if (!typeAttributesHistogram[csFreqIdx][type]) 
fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); // NOTE(review): failure is only logged; the NULL pointer is still written through below and the previous buffer pointer has been overwritten
+
+                       // insert value
+                       
typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type]
 - 1].value = typeList[i];
+                       
typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type]
 - 1].freq = 1;
+               }
+       }
+
+       GDKfree(leaf); // release the temporary leaf flags
+}
+
 /* Loop through all subjects to collect frequency statistics for type 
attribute values. */
 static
-void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter 
oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int 
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, char** typeAttributes) {
+void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter 
oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int 
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat, OntClass 
*ontclassSet) {
        // looping, extracting
        BUN             p, q;
        oid             *sbt, *obt, *pbt;
        char            objType;
        oid             objOid;
        int             csFreqIdx;
+       oid             curS; // last subject
+       int             curT; // last type (index in 'typeAttributes' array)
+       oid             *typeValues; // list of type values per subject and type
+       int             typeValuesSize;
+       int             typeValuesMaxSize = 10;
 
        // histogram
        int             i, j, k;
-       int             fit;
 
        oid             *typeAttributesOids = malloc(sizeof(oid) * 
typeAttributesCount);
 
@@ -878,6 +948,11 @@ void createTypeAttributesHistogram(BAT *
                TKNZRappend(&typeAttributesOids[i], &typeAttributes[i]);
        }
 
+       curS = BUN_NONE;
+       curT = -1;
+       typeValues = GDKmalloc(sizeof(oid) * typeValuesMaxSize);
+       if (!typeValues) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+       typeValuesSize = 0;
        BATloop(sbat, p, q) {
                // Get data
                sbt = (oid *) BUNtloc(si, p);
@@ -907,32 +982,36 @@ void createTypeAttributesHistogram(BAT *
                                        objOid = objOid - (objType*2 + 1) *  
RDF_MIN_LITERAL;   /* Get the real objOid from Map or Tokenizer */
                                }
 
-                               // add object to histogram
-                               fit = 0;
-                               for (j = 0; j < 
typeAttributesHistogramCount[csFreqIdx][i]; ++j) {
-                                       if 
(typeAttributesHistogram[csFreqIdx][i][j].value == objOid) {
-                                               // bucket exists
-                                               
typeAttributesHistogram[csFreqIdx][i][j].freq += 1;
-                                               fit = 1;
-                                               break;
+                               // if finished looping over one subject or 
type, the list of type values is analyzed and added to the histogram
+                               if (curS != *sbt || curT != i) {
+                                       if (curS == BUN_NONE || typeValuesSize 
== 0) {
+                                               // nothing to add to histogram
+                                       } else {
+                                               // analyze values and add to 
histogram
+                                               
insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, 
typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, 
ontmetaBat, ontclassSet);
+                                               typeValuesSize = 0; // reset
                                        }
+                                       curS = *sbt;
+                                       curT = i;
                                }
-                               if (!fit) {
-                                       // bucket does not exist
-                                       // realloc
-                                       
typeAttributesHistogramCount[csFreqIdx][i] += 1;
-                                       typeAttributesHistogram[csFreqIdx][i] = 
(TypeAttributesFreq *) realloc(typeAttributesHistogram[csFreqIdx][i], 
sizeof(TypeAttributesFreq) * typeAttributesHistogramCount[csFreqIdx][i]);
-                                       if 
(!typeAttributesHistogram[csFreqIdx][i]) fprintf(stderr, "ERROR: Couldn't 
realloc memory!\n");
-
-                                       // insert value
-                                       
typeAttributesHistogram[csFreqIdx][i][typeAttributesHistogramCount[csFreqIdx][i]
 - 1].value = objOid;
-                                       
typeAttributesHistogram[csFreqIdx][i][typeAttributesHistogramCount[csFreqIdx][i]
 - 1].freq = 1;
+                               // add value to list of type values
+                               if (typeValuesSize == typeValuesMaxSize) {
+                                       // resize
+                                       typeValuesMaxSize *= 2;
+                                       typeValues = GDKrealloc(typeValues, 
sizeof(oid) * typeValuesMaxSize);
+                                       if (!typeValues) fprintf(stderr, 
"ERROR: Couldn't realloc memory!\n");
                                }
+                               typeValues[typeValuesSize++] = *obt;
                                break;
                        }
                }
        }
 
+       // analyze and add last set of typeValues
+       if (curS != BUN_NONE && typeValuesSize != 0) 
insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize, 
typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, 
ontmetaBat, ontclassSet);
+
+       GDKfree(typeValues);
+
        // sort descending by frequency
        for (i = 0; i < freqCSset->numCSadded; ++i) {
                for (j = 0; j < typeAttributesCount; ++j) {
@@ -1094,7 +1173,7 @@ int compareOntologyCandidates (const voi
 #if USE_ONTOLOGY_NAMES
 /* For one CS: Calculate the ontology classes that are similar (tfidf) to the 
list of attributes. */
 static
-oid* getOntologyCandidates(oid** ontattributes, int ontattributesCount, oid** 
ontmetadata, int ontmetadataCount, int *resultCount, oid **listOids, int 
*listCount, int listNum, PropStat *propStat, float *totaltfidfsPerOntology) {
+oid* getOntologyCandidates(oid** ontattributes, int ontattributesCount, oid** 
ontmetadata, int ontmetadataCount, int *resultCount, oid **listOids, int 
*listCount, int listNum, PropStat *propStat) {
        int             i, j, k, l;
        oid             *result = NULL;
 
@@ -1147,6 +1226,7 @@ oid* getOntologyCandidates(oid** ontattr
                        BUN p, bun;
                        p = listOids[i][j];
                        bun = BUNfnd(BATmirror(propStat->pBat), (ptr) &p);
+                       if (bun == BUN_NONE) continue; // property does not 
belong to an ontology class and therefore has no tfidfs score
                        for (k = 0; k < candidatesCount[j]; ++k) { // for each 
candidate
                                // search for this class
                                int found = 0;
@@ -1169,21 +1249,6 @@ oid* getOntologyCandidates(oid** ontattr
                                }
                        }
                }
-               
-               //[DUC --- add the total tfidf score for a ontology class]  
//TODO: Compute before, not here
-               for (l = 0; l < num; ++l){
-                       for (j = 0; j < ontmetadataCount; ++j) {
-                               oid auri = ontmetadata[0][j];
-                               //printf("auri = " BUNFMT "\n", auri);
-                               if (auri == classStat[l].ontoClass){
-                                       //printf("Classstat %d (uri: "BUNFMT ") 
- Set totaltfidf with ontology %dth: %f \n", l, auri, j, 
totaltfidfsPerOntology[j]); 
-                                       classStat[l].totaltfidfs = 
totaltfidfsPerOntology[j]; 
-                                       break; 
-                               }
-                       }
-               }
-               //[ ... DUC]
-
 
                // calculate optimal tfidf score (all properties) & normalize 
tfidf sums
                totalTfidfs = 0.0;
@@ -1194,11 +1259,7 @@ oid* getOntologyCandidates(oid** ontattr
                        totalTfidfs += (propStat->tfidfs[bun] * 
propStat->tfidfs[bun]);
                }
                for (j = 0; j < num; ++j) {
-                       //classStat[j].tfidfs /= totalTfidfs;  //[DUC--modify]
-                       //printf("original classStat[j].tfidfs = %f \n", 
classStat[j].tfidfs);
-                       classStat[j].tfidfs = classStat[j].tfidfs / 
(sqrt(totalTfidfs)*sqrt(classStat[j].totaltfidfs));
-                       //printf("totalTfidfs = %f    || 
classStat[j].totaltfidfs =  %f || classStat[j].tfidfs = %f 
\n",totalTfidfs,classStat[j].totaltfidfs,classStat[j].tfidfs);
-                       
+                       classStat[j].tfidfs /= totalTfidfs;
                }
 
                // sort by tfidf desc
@@ -1408,8 +1469,6 @@ static
 void createOntologyLookupResult(oid** result, CSset* freqCSset, int* 
resultCount, oid** ontattributes, int ontattributesCount, oid** ontmetadata, 
int ontmetadataCount) {
        int             i, j;
        PropStat        *propStat;
-       float*          totaltfidfsPerOntology;         //[DUC]
-       oid             lastUri; 
 
        propStat = initPropStat();
 
@@ -1417,34 +1476,6 @@ void createOntologyLookupResult(oid** re
        // Not the properties from freqCS
        //createPropStatistics(propStat, freqCSset->numCSadded, freqCSset);
        createPropStatistics(propStat, ontattributes, ontattributesCount);
-       
-
-       lastUri = BUN_NONE; 
-       totaltfidfsPerOntology = (float*) malloc(sizeof(float) * 
ontmetadataCount);
-       //printf("Init tfidf for all %d ontologies \n",ontmetadataCount );
-       for (i = 0; i < ontmetadataCount; ++i) {
-               oid auri = ontmetadata[0][i];
-
-               if (auri == lastUri){ 
-                       //printf("Duplication at %d value " BUNFMT "\n", i, 
auri); 
-                       continue; 
-               }
-               else lastUri = auri; 
-               totaltfidfsPerOntology[i] = 0; 
-
-               for (j = 0; j < ontattributesCount; j++){
-                       oid tmpuri = ontattributes[0][j];
-                       oid aattr = ontattributes[1][j];
-                       if (auri == tmpuri){
-                               BUN bun = BUNfnd(BATmirror(propStat->pBat), 
(ptr) &aattr);
-                               if (bun == BUN_NONE) printf("[Debug] This 
cannot happen \n");
-                               else
-                                       totaltfidfsPerOntology[i] += 
(propStat->tfidfs[bun] * propStat->tfidfs[bun]);
-                       }       
-               }
-               //printf("Computed totaltfidfsPerOntology of ontology %d: %f 
(uri = "BUNFMT")\n",i, totaltfidfsPerOntology[i],auri);
-       }
-       //... [DUC]
 
        for (i = 0; i < freqCSset->numCSadded; ++i) {
                CS              cs;
@@ -1475,7 +1506,7 @@ void createOntologyLookupResult(oid** re
 
                // get class names
                resultCount[i] = 0;
-               result[i] = getOntologyCandidates(ontattributes, 
ontattributesCount, ontmetadata, ontmetadataCount, &(resultCount[i]), 
propOntologiesOids, propOntologiesCount, ontologyCount, 
propStat,totaltfidfsPerOntology);
+               result[i] = getOntologyCandidates(ontattributes, 
ontattributesCount, ontmetadata, ontmetadataCount, &(resultCount[i]), 
propOntologiesOids, propOntologiesCount, ontologyCount, propStat);
 
                for (j = 0; j < ontologyCount; ++j) {
                        free(propOntologies[j]);
@@ -1486,7 +1517,6 @@ void createOntologyLookupResult(oid** re
                free(propOntologiesCount);
        }
        freePropStat(propStat);
-       free(totaltfidfsPerOntology);
 }
 #endif
 
@@ -2703,7 +2733,7 @@ CSlabel* createLabels(CSset* freqCSset, 
        typeAttributesHistogramCount = 
initTypeAttributesHistogramCount(typeAttributesCount, freqCSset->numCSadded);
        typeAttributesHistogram = 
initTypeAttributesHistogram(typeAttributesCount, freqCSset->numCSadded);
 #if USE_TYPE_NAMES
-       createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset, 
csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram, 
typeAttributesHistogramCount, typeAttributes);
+       createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset, 
csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram, 
typeAttributesHistogramCount, typeAttributes, ontmetaBat, ontclassSet);
        typeStat = getTypeStats(&typeStatCount, freqCSset->numCSadded, 
typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount);
 #else
        (void) sbat;
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to