Changeset: c8adf21bfcde for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c8adf21bfcde
Added Files:
        sql/server/rel_rdfscan.c
        sql/server/rel_rdfscan.h
Modified Files:
        monetdb5/extras/rdf/rdflabels.c
        monetdb5/extras/rdf/rdflabels.h
        monetdb5/optimizer/opt_pipes.c
        sql/backends/monet5/rel_bin.c
        sql/backends/monet5/sql.c
        sql/backends/monet5/sql_rdf.h
        sql/backends/monet5/sql_rdf_jgraph.c
        sql/include/sql_relation.h
        sql/server/Makefile.ag
        sql/server/rel_distribute.c
        sql/server/rel_dump.c
        sql/server/rel_optimizer.c
        sql/server/rel_select.c
Branch: rdf
Log Message:

Initial step in creating rdfscan

+ add the computation and usage of rankscore in labeling


diffs (truncated from 970 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -403,6 +403,80 @@ getStringBetweenQuotes(str* out, str in)
        }
 }
 
+#if TYPE_TFIDF_RANKING
+/*
+ * Init the BATs for storing all type oids and their frequency.
+ *
+ * glTypeValueBat: out; receives a BAT created with
+ *                 BATnew(TYPE_void, TYPE_oid, ...), seqbase 0, on which
+ *                 a hash is requested.
+ * glTypeFreqBat:  out; receives a BAT created with
+ *                 BATnew(TYPE_void, TYPE_int, ...).
+ *
+ * Failures are only reported on stderr. When the value BAT cannot be
+ * created, both out parameters are set to NULL and the function returns
+ * immediately, so a NULL BAT is never passed to BATseqbase/BAThash.
+ * When only the frequency BAT cannot be created, *glTypeFreqBat is NULL
+ * and *glTypeValueBat stays allocated.
+ * */
+static 
+void initGlobalTypeBATs(BAT **glTypeValueBat, BAT **glTypeFreqBat){
+
+       *glTypeValueBat = BATnew(TYPE_void, TYPE_oid, smallbatsz, TRANSIENT);
+       if (*glTypeValueBat == NULL) {
+               fprintf(stderr, "ERROR: Couldn't create BAT!\n");
+               // Leave both outputs in a defined state and stop here:
+               // nothing below may run on a NULL BAT.
+               *glTypeFreqBat = NULL;
+               return;
+       }
+       // Only touch the BAT once the allocation is known to have succeeded
+       BATseqbase(*glTypeValueBat, 0);
+
+       (void)BAThash(*glTypeValueBat,0);
+       if (!((*glTypeValueBat)->T->hash)){
+               fprintf(stderr, "ERROR: Couldn't create Hash for BAT!\n");      
+       }
+
+       *glTypeFreqBat = BATnew(TYPE_void, TYPE_int, smallbatsz, TRANSIENT);
+       if (*glTypeFreqBat == NULL) {
+               fprintf(stderr, "ERROR: Couldn't create BAT!\n");       
+       }
+
+}
+
+/*
+ * Release the references held on the two global type BATs via
+ * BBPunfix.
+ *
+ * Either argument may be NULL (initGlobalTypeBATs stores the raw
+ * BATnew result, which is NULL when creation fails); NULL arguments
+ * are skipped instead of being dereferenced.
+ */
+static
+void freeGlobalTypeBATs(BAT *glTypeValueBat, BAT *glTypeFreqBat){
+       if (glTypeValueBat != NULL) {
+               BBPunfix(glTypeValueBat->batCacheid);
+       }
+       if (glTypeFreqBat != NULL) {
+               BBPunfix(glTypeFreqBat->batCacheid);
+       }
+}
+
+/*
+ * Record one more occurrence of typevalue in the global type BATs.
+ *
+ * typevalue:      the type oid to count.
+ * glTypeValueBat: BAT searched (BUNfnd) for typevalue; new values are
+ *                 appended to it.
+ * glTypeFreqBat:  BAT holding one int counter per entry of
+ *                 glTypeValueBat, addressed by the same position.
+ *
+ * A value that is already present gets its counter incremented in
+ * place; an unseen value is appended with a counter of 1.
+ */
+static 
+void addGlobalType(oid typevalue, BAT *glTypeValueBat, BAT *glTypeFreqBat){
+       oid key = typevalue;
+       int firstCount = 1;
+       BUN pos = BUNfnd(glTypeValueBat, (ptr) &key);
+
+       if (pos != BUN_NONE){   //Known type value: bump its counter
+               int *counter = (int *)Tloc(glTypeFreqBat, pos);
+               ++(*counter);
+               return;
+       }
+
+       //New type value: rebuild the hash with a larger size hint once the
+       //BAT holds more than 4 times the current hash mask
+       if (glTypeValueBat->T->hash
+           && BATcount(glTypeValueBat) > 4 * glTypeValueBat->T->hash->mask) {
+               HASHdestroy(glTypeValueBat);
+               BAThash(glTypeValueBat, 2*BATcount(glTypeValueBat));
+       }
+
+       //Append value and its initial frequency at the same position
+       BUNappend(glTypeValueBat, &key, TRUE);
+       BUNappend(glTypeFreqBat, &firstCount, TRUE);
+}
+
+/*
+ * Look up the global frequency stored for typevalue.
+ *
+ * typevalue:      the type oid to look up.
+ * glTypeValueBat: BAT searched (BUNfnd) for typevalue.
+ * glTypeFreqBat:  BAT holding the int counter at the position found in
+ *                 glTypeValueBat.
+ *
+ * Returns the stored counter, or -1 (after printing an error on stderr)
+ * when typevalue is not present in glTypeValueBat.
+ */
+static
+int getTypeGlobalFrequency(oid typevalue, BAT *glTypeValueBat, BAT *glTypeFreqBat){
+       oid key = typevalue;
+       BUN pos = BUNfnd(glTypeValueBat, (ptr) &key);
+
+       if (pos == BUN_NONE){   //Unknown type value
+               fprintf(stderr, "ERROR: This typevalue must be there!\n");      
+               return -1;
+       }
+
+       return *(int *)Tloc(glTypeFreqBat, pos);
+}
+#endif
+
+
 #if USE_TYPE_NAMES
 static
 int compareTypeAttributesFreqs (const void * a, const void * b) {
@@ -413,7 +487,7 @@ int compareTypeAttributesFreqs (const vo
 #if USE_TYPE_NAMES
 /* Add type values to the histogram. Values that are not present in the 
hierarchy tree built from the ontologies are NOT added to the histogram. */
 static
-void insertValuesIntoTypeAttributesHistogram(oid* typeList, int 
typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat) {
+void insertValuesIntoTypeAttributesHistogram(oid* typeList, int 
typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat, BAT 
*glTypeValueBat, BAT *glTypeFreqBat) {
        int             i, j;
        int             fit;
        (void) ontmetaBat;
@@ -444,9 +518,19 @@ void insertValuesIntoTypeAttributesHisto
                        
typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type]
 - 1].value = typeList[i];
                        
typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type]
 - 1].freq = 1;
                }
+
+               //Add to global types
+               #if TYPE_TFIDF_RANKING
+               addGlobalType(typeList[i], glTypeValueBat, glTypeFreqBat); 
+               #else
+               (void) glTypeValueBat; 
+               (void) glTypeFreqBat; 
+               #endif
        }
 }
 
+
+
 /* Loop through all subjects to collect frequency statistics for type 
attribute values. */
 static
 void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter 
oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int 
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat) {
@@ -461,12 +545,21 @@ void createTypeAttributesHistogram(BAT *
        oid             *typeValues; // list of type values per subject and type
        int             typeValuesSize;
        int             typeValuesMaxSize = 10;
+       int             numS = 0; 
 
        // histogram
        int             i, j, k;
 
        oid             *typeAttributesOids = malloc(sizeof(oid) * 
typeAttributesCount);
 
+       BAT             *glTypeValueBat = NULL; //Store the oid of each type 
value 
+       BAT             *glTypeFreqBat = NULL;  //Store the global frequency 
(#of subjects) of a type value 
+
+       #if TYPE_TFIDF_RANKING  
+       int             tmpgl_freq = 0; 
+       initGlobalTypeBATs(&glTypeValueBat, &glTypeFreqBat); 
+       #endif
+
        if (BATcount(sbat) == 0) {
                fprintf(stderr, "sbat must not be empty");
                /* otherwise, variable sbt is not initialized and thus
@@ -517,10 +610,11 @@ void createTypeAttributesHistogram(BAT *
                                        } else {
                                                // analyze values and add to 
histogram
                                                csFreqIdx = 
csIdFreqIdxMap[subjCSMap[curS]]; // get csFreqIdx of last subject
-                                               
insertValuesIntoTypeAttributesHistogram(typeValues, typeValuesSize, 
typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, 
ontmetaBat);
+                                               
insertValuesIntoTypeAttributesHistogram(typeValues, typeValuesSize, 
typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT, 
ontmetaBat, glTypeValueBat,glTypeFreqBat);
                                                typeValuesSize = 0; // reset
                                        }
                                        curS = *sbt;
+                                       numS++; 
                                        curT = i;
                                }
                                // add value to list of type values
@@ -539,7 +633,7 @@ void createTypeAttributesHistogram(BAT *
        // analyze and add last set of typeValues
        if (curS != BUN_NONE && typeValuesSize != 0) {
                csFreqIdx = csIdFreqIdxMap[subjCSMap[curS]]; // get csFreqIdx 
of last subject
-               insertValuesIntoTypeAttributesHistogram(typeValues, 
typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, 
csFreqIdx, curT, ontmetaBat);
+               insertValuesIntoTypeAttributesHistogram(typeValues, 
typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount, 
csFreqIdx, curT, ontmetaBat, glTypeValueBat,glTypeFreqBat);
        }
 
        GDKfree(typeValues);
@@ -551,18 +645,27 @@ void createTypeAttributesHistogram(BAT *
                }
        }
 
-       // assign percentage
+       (void) numS; 
+       // assign percentage and tf-idf ranking score
        for (i = 0; i < freqCSset->numCSadded; ++i) {
                for (j = 0; j < typeAttributesCount; ++j) {
                        // assign percentage values for every value
                        for (k = 0; k < typeAttributesHistogramCount[i][j]; 
++k) {
                                typeAttributesHistogram[i][j][k].percent = 
(int) (100.0 * typeAttributesHistogram[i][j][k].freq / 
freqCSset->items[i].support + 0.5);
-
+                               #if TYPE_TFIDF_RANKING
+                               tmpgl_freq = 
getTypeGlobalFrequency(typeAttributesHistogram[i][j][k].value, glTypeValueBat, 
glTypeFreqBat); 
+                               typeAttributesHistogram[i][j][k].rankscore = 
((float) typeAttributesHistogram[i][j][k].percent * numS) / (float) tmpgl_freq; 
+                               //printf("numS = %d, oid "BUNFMT", 
typeAttributesHistogram[i][j][k].freq = %d, tmpgl_freq = %d, percent = %d , 
rankscore = %f\n",
+                               //              numS,  
typeAttributesHistogram[i][j][k].value, typeAttributesHistogram[i][j][k].freq, 
tmpgl_freq, typeAttributesHistogram[i][j][k].percent, 
typeAttributesHistogram[i][j][k].rankscore);
+                               #endif
                        }
                }
        }
 
        free(typeAttributesOids);
+       #if TYPE_TFIDF_RANKING
+       freeGlobalTypeBATs(glTypeValueBat, glTypeFreqBat);
+       #endif
 }
 #endif
 
@@ -1316,6 +1419,13 @@ void getTableName(CSlabel* label, CSset*
        char            nameFound = 0;
        oid             maxDepthOid;
        int             maxFreq;
+       
+       #if TYPE_TFIDF_RANKING  
+       oid             maxRankscoreOid; 
+       float           maxRankscore = 0.0; 
+       float           tmprankscore = 0.0; 
+       int             maxRankscoreFreq; 
+       #endif
 
        //for choosing the right type values
        BUN             ontClassPos;
@@ -1339,7 +1449,7 @@ void getTableName(CSlabel* label, CSset*
        label->nameFreq = 0;
        label->ontologySimScore = 0.0;
        #endif
-
+       
        for (i = 0; i < typeAttributesCount; ++i) {
                foundOntologyTypeValue = 0;
                if (typeAttributesHistogramCount[csIdx][i] == 0) continue;
@@ -1373,6 +1483,12 @@ void getTableName(CSlabel* label, CSset*
                // of all values that are >= TYPE_FREQ_THRESHOLD, choose the 
value with the highest hierarchy level ("deepest" value)
                maxDepthOid = typeAttributesHistogram[csIdx][i][0].value;
                maxFreq = typeAttributesHistogram[csIdx][i][0].freq;
+               #if TYPE_TFIDF_RANKING
+               maxRankscore = typeAttributesHistogram[csIdx][i][0].rankscore; 
+               maxRankscoreOid = typeAttributesHistogram[csIdx][i][0].value;
+               maxRankscoreFreq = typeAttributesHistogram[csIdx][i][0].freq;
+               #endif
+
                ontClassPos = BUNfnd(ontmetaBat, &maxDepthOid);
                if ( ontClassPos != BUN_NONE){
                        foundOntologyTypeValue = 1;
@@ -1405,6 +1521,14 @@ void getTableName(CSlabel* label, CSset*
                                        maxFreq = freq;
                                }
 
+                               #if TYPE_TFIDF_RANKING
+                               tmprankscore = 
typeAttributesHistogram[csIdx][i][j].rankscore; 
+                               if (tmprankscore > maxRankscore){
+                                       maxRankscore = tmprankscore; 
+                                       maxRankscoreOid = 
typeAttributesHistogram[csIdx][i][j].value;
+                                       maxRankscoreFreq = 
typeAttributesHistogram[csIdx][i][j].freq;   
+                               }
+                               #endif
                        }
                }
 
@@ -1414,6 +1538,10 @@ void getTableName(CSlabel* label, CSset*
                if (foundOntologyTypeValue){
                        choosenOntologyTypeValue = maxDepthOid;
                        choosenFreq = maxFreq;
+                       #if TYPE_TFIDF_RANKING
+                       choosenOntologyTypeValue = maxRankscoreOid; 
+                       choosenFreq = maxRankscoreFreq; 
+                       #endif
                }
        }
 
@@ -2027,6 +2155,7 @@ CSlabel* createLabels(CSset* freqCSset, 
         clock_t                tmpLastT;
 
 
+
        str             schema = "rdf";
        int             ret;
 
diff --git a/monetdb5/extras/rdf/rdflabels.h b/monetdb5/extras/rdf/rdflabels.h
--- a/monetdb5/extras/rdf/rdflabels.h
+++ b/monetdb5/extras/rdf/rdflabels.h
@@ -28,6 +28,7 @@ typedef struct TypeAttributesFreq {
        oid             value;
        int             freq;
        int             percent;
+       float           rankscore;      //= percent / global
 } TypeAttributesFreq;
 
 // Statistics for a foreign key relationship
@@ -106,6 +107,7 @@ enum {
 #define        ONLY_USE_ONTOLOGYBASED_TYPE 0
 #define USE_BEST_TYPEVALUE_INSTEADOF_DUMMY 1  //Use the most frequent type 
value instead of a dummy for the label name 
 #define MIN_POSSIBLE_TYPE_FREQ_THRESHOLD  20  //However, that type must still 
appears in more than a minimum threshold
+#define TYPE_TFIDF_RANKING 1   //Rank value of type property by using (percent 
in a CS) / (percent in all subjects)
 
 rdf_export void
 getPropNameShort(char** name, char* propStr);
diff --git a/monetdb5/optimizer/opt_pipes.c b/monetdb5/optimizer/opt_pipes.c
--- a/monetdb5/optimizer/opt_pipes.c
+++ b/monetdb5/optimizer/opt_pipes.c
@@ -116,6 +116,31 @@ static struct PIPELINES {
         "optimizer.generator();"
         "optimizer.garbageCollector();",
         "stable", NULL, NULL, 1},
+/* The rdf_opt_pipe is used for rdf/sparql queries; it is identical to
+ * the no_mitosis_pipe.
+ */
+       {"rdf_opt_pipe",
+        "optimizer.inline();"
+        "optimizer.remap();"
+        "optimizer.costModel();"
+        "optimizer.coercions();"
+        "optimizer.evaluate();"
+        "optimizer.aliases();"
+        "optimizer.pushselect();"
+        "optimizer.mergetable();"
+        "optimizer.deadcode();"
+        "optimizer.commonTerms();"
+        "optimizer.joinPath();"
+        "optimizer.reorder();"
+        "optimizer.deadcode();"
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to