Changeset: 20d17afb3ae1 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=20d17afb3ae1
Modified Files:
	monetdb5/extras/rdf/rdfschema.c
	monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Replace several parameters by a single param diffs (128 lines): diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -503,7 +503,7 @@ void getOrigRefCount(CSrel *csrelSet, CS for (j = 0; j < csrelSet[i].numRef; j++){ freqId = csrelSet[i].lstRefFreqIdx[j]; #if FILTER_INFREQ_FK_FOR_IR - if (csrelSet[i].lstCnt[j] < FILTER_THRESHOLD_FK_FOR_IR * freqCSset->items[freqId].support) continue; + if (csrelSet[i].lstCnt[j] < INFREQ_TYPE_THRESHOLD * freqCSset->items[freqId].support) continue; #endif //Do not count the self-reference if (freqId != i) refCount[freqId] += csrelSet[i].lstCnt[j]; @@ -536,7 +536,7 @@ void getIRNums(CSrel *csrelSet, CSset *f for (j = 0; j < csrelSet[i].numRef; j++){ freqId = csrelSet[i].lstRefFreqIdx[j]; #if FILTER_INFREQ_FK_FOR_IR - if (csrelSet[i].lstCnt[j] < FILTER_THRESHOLD_FK_FOR_IR * freqCSset->items[freqId].support) continue; + if (csrelSet[i].lstCnt[j] < INFREQ_TYPE_THRESHOLD * freqCSset->items[freqId].support) continue; #endif if (freqId != i){ //Do not count the self-reference //curIRScores[freqId] += (lastIRScores[i] * (float)csrelSet[i].lstCnt[j]/(float)refCount[freqId]) + csrelSet[i].lstCnt[j]; @@ -867,7 +867,7 @@ char isMultiValueCol(PropTypes pt){ tmpRatio = ((double)pt.propCover / (pt.numSingleType + pt.numMVType)); //printf("NumMVType = %d | Ratio %f \n", pt.numMVType, tmpRatio); - if ((pt.numMVType > 0) && (tmpRatio > IS_MULVALUE_THRESHOLD)){ + if ((pt.numMVType > 0) && (tmpRatio > (1 + INFREQ_TYPE_THRESHOLD))){ return 1; } else return 0; @@ -3603,7 +3603,7 @@ void generatecsRelSum(CSrel csRel, int f freq = freqCSset->items[csRel.origFreqIdx].support; referredFreqId = csRel.lstRefFreqIdx[i]; freqOfReferredCS = freqCSset->items[referredFreqId].support; - if (freq > MIN_FROMTABLE_SIZE_S5 && freq < csRel.lstCnt[i] * MIN_PERCETAGE_S5 + if (freq > MIN_FROMTABLE_SIZE_S5 && (((float)freq * INFREQ_TYPE_THRESHOLD) < csRel.lstCnt[i])) && 
freqOfReferredCS < csRel.lstCnt[i] * MIN_TO_PERCETAGE_S5){ p = csRel.lstPropId[i]; @@ -8437,7 +8437,7 @@ CSrel* getFKBetweenTableSet(CSrel *csrel // add relation to new data structure //Compare with prop coverage from csproptype - if (rel.lstCnt[j] < freqCSset->items[toFreqId].support * MIN_FK_FREQUENCY) continue; + if (rel.lstCnt[j] < freqCSset->items[toFreqId].support * INFREQ_TYPE_THRESHOLD) continue; to = mfreqIdxTblIdxMapping[toFreqId]; assert(to != -1); @@ -8455,7 +8455,7 @@ CSrel* getFKBetweenTableSet(CSrel *csrel //Filtering: For big size table, if large number of prop's instances need to refer to a certain table // else, all instances of that prop must refer to the certain table if (freqCSset->items[i].coverage > MINIMUM_TABLE_SIZE){ - if (csPropTypes[from].lstPropTypes[propIdx].propCover * MIN_FK_PROPCOVERAGE > rel.lstCnt[j]) continue; + if (csPropTypes[from].lstPropTypes[propIdx].propCover * (1 - INFREQ_TYPE_THRESHOLD) > rel.lstCnt[j]) continue; else if (csPropTypes[from].lstPropTypes[propIdx].propCover == rel.lstCnt[j]) csPropTypes[from].lstPropTypes[propIdx].isDirtyFKProp = 0; else diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h --- a/monetdb5/extras/rdf/rdfschema.h +++ b/monetdb5/extras/rdf/rdfschema.h @@ -127,10 +127,10 @@ typedef struct PropStat { #define OUTPUT_FREQID_PER_LABEL 1 /* This is for evaluating the results of merging using S1. 
TODO: Set it to 0 for default*/ #define MERGING_CONSIDER_NAMEORIGINALITY 0 /*Merging in rule S1, considering where the name comes from (e.g., from Ontology, from rdf:type, or from FK) */ -#define IS_MULVALUE_THRESHOLD 1.1 /* The ratio betweeen (the number of triple coverred by Prop P) / (number of Non-NULL object values for P) - If this ratio is ~1, only use single value column for that prop - */ -#define INFREQ_TYPE_THRESHOLD 0.01 /* Threshold that a type is consider as an infrequent type */ +//#define IS_MULVALUE_THRESHOLD 1.1 //The ratio betweeen (the number of triple coverred by Prop P) / (number of Non-NULL object values for P) + // If this ratio is ~1, only use single value column for that prop + // Replaced by ( 1 + INFREQ_TYPE_THRESHOLD) as multi-prop can be considered as the type of the props +#define INFREQ_TYPE_THRESHOLD 0.1 /* Threshold that a type is consider as an infrequent type */ @@ -154,7 +154,8 @@ typedef struct PropStat { #define ONLY_MERGE_URINAME_CS_S1 0 /* Only merge CS's whose name is an URI */ #define FILTER_INFREQ_FK_FOR_IR 1 /* We filter out all the dirty references from a CS */ -#define FILTER_THRESHOLD_FK_FOR_IR 0.1 /* The FK that their frequency < FILTER_THRESHOLD_FK_FOR_IR * FreqCS's frequency */ +//#define FILTER_THRESHOLD_FK_FOR_IR 0.1 /* The FK that their frequency < FILTER_THRESHOLD_FK_FOR_IR * FreqCS's frequency */ +// //Replaced by INFREQ_TYPE_THRESHOLD as a reference can be considered as a type of the object value /*------------------------------------*/ @@ -245,14 +246,15 @@ typedef struct SubCSSet{ int numAllocation; } SubCSSet; -//#define INIT_NUM_CS 9999 // workaround -#define INIT_NUM_CS 1000 // workaround +#define INIT_NUM_CS 1000 #define SIM_THRESHOLD 0.6 #define SIM_TFIDF_THRESHOLD 0.75 #define IMPORTANCE_THRESHOLD 0.001 //This is used when merging CS's by common ancestor #define COMMON_ANCESTOR_LOWEST_SPECIFIC_LEVEL 2 -#define MIN_PERCETAGE_S5 5 // Merge all CS refered by more than 1/MIN_PERCETAGE_S6 percent of a CS 
via one property +//#define MIN_PERCETAGE_S5 5 // Merge all CS refered by more than 1/MIN_PERCETAGE_S6 percent of a CS via one property + // Replaced by using INFREQ_TYPE_THRESHOLD + // #define MIN_FROMTABLE_SIZE_S5 100 // The minimum size of the "from" table in S6. Meaning that // the CS's to-be-merged in this rule must cover > MIN_FROMTABLE_SIZE_S6 / MIN_PERCETAGE_S6 triples #define MIN_TO_PERCETAGE_S5 10 // Threshold for the number of instances in the target CS refered by the property @@ -271,10 +273,8 @@ typedef struct SubCSSet{ //#define MINIMUM_TABLE_SIZE 1 // For example dataset only #define HIGH_REFER_THRESHOLD 5 -//#define INFREQ_PROP_THRESHOLD 0.01 #define INFREQ_PROP_THRESHOLD 0.05 -//#define INFREQ_PROP_THRESHOLD 0.2 //For Testing #define REMOVE_INFREQ_PROP 1 #define REMOVE_LOTSOFNULL_SUBJECT 1 #define LOTSOFNULL_SUBJECT_THRESHOLD 0.1 @@ -287,8 +287,10 @@ typedef struct SubCSSet{ //contain small table #define STRANGE_PROP_FREQUENCY 10 //If the prop appears in less than 3 instances, it may be the black sheep -#define MIN_FK_FREQUENCY 0.1 // The frequency of a FK should be > MIN_FK_FREQUENCY * The frequency of a mergedCS (or the number of tuples in one table) -#define MIN_FK_PROPCOVERAGE 0.9 // The FK needs to happen in MIN_FK_PROPCOVERAGE of all instances of the particular property +//#define MIN_FK_FREQUENCY 0.1 // The frequency of a FK should be > MIN_FK_FREQUENCY * The frequency of a mergedCS (or the number of tuples in one table) + // Replaced by INFREQ_TYPE_THRESHOLD +//#define MIN_FK_PROPCOVERAGE 0.9 // The FK needs to happen in MIN_FK_PROPCOVERAGE of all instances of the particular property + // Replaced by (1 - INFREQ_TYPE_THRESHOLD) #define EXPORT_LABEL 1 /* Export labels: TODO: */ _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list