Attached is a patch for raptor svn r15635 (also works with the 1.4.19
release) which addresses a scalability problem in Turtle serialization
by replacing raptor_sequence with raptor_avltree for the subject and
blanks containers passed to raptor_abbrev_node_lookup. This attempts
to fix the "this should really be a hash, not a list" FIXME previously
noted in that function.
The tricky bit is handling the way blank nodes were removed from the
sequence when written, if they were not going to be needed again.
Because you can't just replace an item in the tree with NULL (as was
done in the sequence) without breaking tree ordering, and you can't
remove an item from the tree while iterating over it, I've instead
added a "valid" flag to the subject struct itself which is reset on
writing and subsequently tested to prevent duplicate writes. I'm not
hugely keen on this, should anyone have any better ideas.
This patch reduces the runtime of my own test case (c. 400K triples
constructed and serialized) from about 25 minutes to about 14 seconds.
It also reduces the runtime for the Turtle test suite from about 12.1
to 10.4 seconds on this machine in release trim, and it passes
valgrind --leak-check=full with no errors or leaks.
The bad news is that it causes a number of unit tests to fail because
of changes to the ordering in output. One test (ex-38) in the rdfxml
and turtle test suites fails (I think because rdfdiff is wrongly
seeing a difference that isn't there), and the entire feeds test suite
fails (I think because it doesn't use rdfdiff at all). I haven't
spotted anything that looks like a "real" failure, but I might be
missing something. Thoughts welcome.
Chris
Index: src/raptor_internal.h
===================================================================
--- src/raptor_internal.h (revision 15635)
+++ src/raptor_internal.h (working copy)
@@ -1199,11 +1199,13 @@
raptor_abbrev_node* node; /* node representing the subject of
* this resource */
raptor_abbrev_node* node_type; /* the rdf:type of this resource */
- raptor_avltree *properties; /* list of properties
+ raptor_avltree *properties; /* list of properties
* (predicate/object pair) of this
* subject */
raptor_sequence *list_items; /* list of container elements if
* is rdf container */
+ int valid; /* set 0 for blank nodes that do not
+ * need to be referred to again */
} raptor_abbrev_subject;
@@ -1211,15 +1213,17 @@
void raptor_free_abbrev_node(raptor_abbrev_node* node);
int raptor_abbrev_node_cmp(raptor_abbrev_node* node1, raptor_abbrev_node* node2);
int raptor_abbrev_node_equals(raptor_abbrev_node* node1, raptor_abbrev_node* node2);
-int raptor_abbrev_node_matches(raptor_abbrev_node* node, raptor_identifier_type node_type, const void *node_data, raptor_uri *datatype, const unsigned char *language);
raptor_abbrev_node* raptor_abbrev_node_lookup(raptor_avltree* nodes, raptor_identifier_type node_type, const void *node_value, raptor_uri *datatype, const unsigned char *language, int* created_p);
raptor_abbrev_subject* raptor_new_abbrev_subject(raptor_abbrev_node* node);
void raptor_free_abbrev_subject(raptor_abbrev_subject* subject);
int raptor_abbrev_subject_add_property(raptor_abbrev_subject* subject, raptor_abbrev_node* predicate, raptor_abbrev_node* object);
int raptor_abbrev_subject_add_list_element(raptor_abbrev_subject* subject, int ordinal, raptor_abbrev_node* object);
-raptor_abbrev_subject* raptor_abbrev_subject_find(raptor_sequence *sequence, raptor_identifier_type node_type, const void *node_data, int *idx);
-raptor_abbrev_subject* raptor_abbrev_subject_lookup(raptor_avltree* nodes, raptor_sequence* subjects, raptor_sequence* blanks, raptor_identifier_type node_type, const void *node_data, int* created_p);
+int raptor_abbrev_subject_cmp(raptor_abbrev_subject* subject1, raptor_abbrev_subject* subject2);
+raptor_abbrev_subject* raptor_abbrev_subject_find(raptor_avltree *subjects, raptor_identifier_type node_type, const void *node_data);
+raptor_abbrev_subject* raptor_abbrev_subject_lookup(raptor_avltree* nodes, raptor_avltree* subjects, raptor_avltree* blanks, raptor_identifier_type node_type, const void *node_data, int* created_p);
+int raptor_abbrev_subject_valid(raptor_abbrev_subject *subject);
+int raptor_abbrev_subject_invalidate(raptor_abbrev_subject *subject);
unsigned char *raptor_unique_id(unsigned char *base);
Index: src/raptor_serialize_turtle.c
===================================================================
--- src/raptor_serialize_turtle.c (revision 15635)
+++ src/raptor_serialize_turtle.c (working copy)
@@ -59,8 +59,8 @@
raptor_namespace *rdf_nspace; /* the rdf: namespace */
raptor_turtle_writer *turtle_writer; /* where the xml is being written */
raptor_sequence *namespaces; /* User declared namespaces */
- raptor_sequence *subjects; /* subject items */
- raptor_sequence *blanks; /* blank subject items */
+ raptor_avltree *subjects; /* subject items */
+ raptor_avltree *blanks; /* blank subject items */
raptor_avltree *nodes; /* nodes */
raptor_abbrev_node *rdf_type; /* rdf:type uri */
@@ -277,15 +277,14 @@
/* If this is only used as a 1 subject and object or never
* used as a subject or never used as an object, it never need
* be referenced with an explicit name */
- int idx;
raptor_abbrev_subject* blank;
blank = raptor_abbrev_subject_find(context->blanks,
node->type,
- node->value.blank.string, &idx);
+ node->value.blank.string);
if(blank) {
rc = raptor_turtle_emit_subject(serializer, blank, depth+1);
- raptor_sequence_set_at(context->blanks, idx, NULL);
+ raptor_abbrev_subject_invalidate(blank);
}
} else {
@@ -319,7 +318,7 @@
{
int rv = 0;
int i=0;
-
+
RAPTOR_DEBUG5("Emitting subject list items for node %p refcount %d subject %d object %d\n",
subject->node,
subject->node->ref_count, subject->node->count_as_subject,
@@ -384,7 +383,6 @@
{
raptor_turtle_context* context=(raptor_turtle_context*)serializer->context;
int rv = 0;
- int idx;
raptor_avltree_iterator* iter=NULL;
int i;
int is_new_subject = 0;
@@ -470,7 +468,7 @@
if(object->type == RAPTOR_IDENTIFIER_TYPE_ANONYMOUS) {
subject = raptor_abbrev_subject_find(context->blanks, object->type,
- object->value.blank.string, &idx);
+ object->value.blank.string);
if(!subject) {
raptor_serializer_error(serializer,
@@ -647,6 +645,8 @@
int collection = 0;
int rc = 0;
+ if (!raptor_abbrev_subject_valid(subject)) return 0;
+
RAPTOR_DEBUG5("Emitting subject node %p refcount %d subject %d object %d\n",
subject->node,
subject->node->ref_count,
@@ -796,28 +796,36 @@
raptor_turtle_context* context=(raptor_turtle_context*)serializer->context;
raptor_abbrev_subject* subject;
raptor_abbrev_subject* blank;
- int i;
int rc;
-
- for(i=0; i < raptor_sequence_size(context->subjects); i++) {
- subject = (raptor_abbrev_subject* )raptor_sequence_get_at(context->subjects, i);
- if(subject) {
- rc= raptor_turtle_emit_subject(serializer, subject, 0);
- if(rc)
+ raptor_avltree_iterator* iter=NULL;
+
+ iter = raptor_new_avltree_iterator(context->subjects, NULL, NULL, 1);
+ while (iter) {
+ subject = (raptor_abbrev_subject *)raptor_avltree_iterator_get(iter);
+ if (subject) {
+ rc = raptor_turtle_emit_subject(serializer, subject, 0);
+ if (rc) {
return rc;
+ }
}
+ if (raptor_avltree_iterator_next(iter)) break;
}
-
- /* Emit any remaining blank nodes */
- for(i=0; i < raptor_sequence_size(context->blanks); i++) {
- blank = (raptor_abbrev_subject* )raptor_sequence_get_at(context->blanks, i);
- if(blank) {
- rc= raptor_turtle_emit_subject(serializer, blank, 0);
- if(rc)
+ if (iter) raptor_free_avltree_iterator(iter);
+
+ /* Emit any remaining blank nodes. */
+ iter = raptor_new_avltree_iterator(context->blanks, NULL, NULL, 1);
+ while (iter) {
+ blank = (raptor_abbrev_subject *)raptor_avltree_iterator_get(iter);
+ if (blank) {
+ rc = raptor_turtle_emit_subject(serializer, blank, 0);
+ if (rc) {
return rc;
+ }
}
+ if (raptor_avltree_iterator_next(iter)) break;
}
-
+ if (iter) raptor_free_avltree_iterator(iter);
+
return 0;
}
@@ -848,10 +856,14 @@
context->namespaces=raptor_new_sequence(NULL, NULL);
context->subjects =
- raptor_new_sequence((raptor_sequence_free_handler *)raptor_free_abbrev_subject,NULL);
+ raptor_new_avltree(serializer->world,
+ (raptor_data_compare_function)raptor_abbrev_subject_cmp,
+ (raptor_data_free_function)raptor_free_abbrev_subject, 0);
context->blanks =
- raptor_new_sequence((raptor_sequence_free_handler *)raptor_free_abbrev_subject,NULL);
+ raptor_new_avltree(serializer->world,
+ (raptor_data_compare_function)raptor_abbrev_subject_cmp,
+ (raptor_data_free_function)raptor_free_abbrev_subject, 0);
context->nodes =
raptor_new_avltree(serializer->world,
@@ -922,12 +934,12 @@
}
if(context->subjects) {
- raptor_free_sequence(context->subjects);
+ raptor_free_avltree(context->subjects);
context->subjects=NULL;
}
if(context->blanks) {
- raptor_free_sequence(context->blanks);
+ raptor_free_avltree(context->blanks);
context->blanks=NULL;
}
@@ -1112,8 +1124,9 @@
statement->subject_type,
statement->subject,
&subject_created);
- if(!subject)
+ if(!subject) {
return 1;
+ }
object_type=statement->object_type;
if(object_type == RAPTOR_IDENTIFIER_TYPE_LITERAL) {
Index: src/raptor_serialize_rdfxmla.c
===================================================================
--- src/raptor_serialize_rdfxmla.c (revision 15635)
+++ src/raptor_serialize_rdfxmla.c (working copy)
@@ -63,9 +63,9 @@
raptor_xml_element* rdf_RDF_element; /* the rdf:RDF element */
raptor_xml_writer *xml_writer; /* where the xml is being written */
raptor_sequence *namespaces; /* User declared namespaces */
- raptor_sequence *subjects; /* subject items */
- raptor_sequence *blanks; /* blank subject items */
- raptor_avltree *nodes; /* nodes */
+ raptor_avltree *subjects; /* subject items */
+ raptor_avltree *blanks; /* blank subject items */
+ raptor_avltree *nodes; /* nodes */
raptor_abbrev_node *rdf_type; /* rdf:type uri */
/* URI of rdf:XMLLiteral */
@@ -413,17 +413,16 @@
/* If this is only used as a 1 subject and object or never
* used as a subject or never used as an object, it never need
* be referenced with an explicit name */
- int idx;
raptor_abbrev_subject* blank;
raptor_xml_writer_start_element(context->xml_writer, element);
blank = raptor_abbrev_subject_find(context->blanks, node->type,
- node->value.blank.string, &idx);
+ node->value.blank.string);
if(blank) {
raptor_rdfxmla_emit_subject(serializer, blank, depth+1);
- raptor_sequence_set_at(context->blanks, idx, NULL);
+ raptor_abbrev_subject_invalidate(blank);
}
} else {
@@ -472,7 +471,7 @@
int rv = 0;
int i=0;
raptor_uri* base_uri=NULL;
-
+
RAPTOR_DEBUG5("Emitting subject list items for node %p refcount %d subject %d object %d\n",
subject->node,
subject->node->ref_count, subject->node->count_as_subject,
@@ -562,7 +561,7 @@
int rv = 0;
int i;
raptor_avltree_iterator* iter=NULL;
-
+
RAPTOR_DEBUG5("Emitting subject properties for node %p refcount %d subject %d object %d\n",
subject->node, subject->node->ref_count,
subject->node->count_as_subject,
@@ -732,6 +731,8 @@
raptor_uri *base_uri=NULL;
int subject_is_single_node;
+ if (!raptor_abbrev_subject_valid(subject)) return 0;
+
subject_is_single_node=(context->single_node &&
subject->node->type == RAPTOR_IDENTIFIER_TYPE_RESOURCE &&
raptor_uri_equals_v2(serializer->world,
@@ -887,24 +888,34 @@
raptor_rdfxmla_context* context=(raptor_rdfxmla_context*)serializer->context;
raptor_abbrev_subject* subject;
raptor_abbrev_subject* blank;
- int i;
-
- for(i=0; i < raptor_sequence_size(context->subjects); i++) {
- subject = (raptor_abbrev_subject* )raptor_sequence_get_at(context->subjects, i);
- if(subject)
+ raptor_avltree_iterator* iter=NULL;
+
+ iter = raptor_new_avltree_iterator(context->subjects, NULL, NULL, 1);
+ while (iter) {
+ subject = (raptor_abbrev_subject*)raptor_avltree_iterator_get(iter);
+ if (subject) {
raptor_rdfxmla_emit_subject(serializer, subject, context->starting_depth);
+ }
+ if (raptor_avltree_iterator_next(iter))
+ break;
}
-
-
- if(!context->single_node) {
+ if (iter)
+ raptor_free_avltree_iterator(iter);
+
+ if (!context->single_node) {
/* Emit any remaining blank nodes */
- for(i=0; i < raptor_sequence_size(context->blanks); i++) {
- blank = (raptor_abbrev_subject* )raptor_sequence_get_at(context->blanks, i);
- if(blank)
- raptor_rdfxmla_emit_subject(serializer, blank, context->starting_depth);
+ iter = raptor_new_avltree_iterator(context->blanks, NULL, NULL, 1);
+ while (iter) {
+ blank = (raptor_abbrev_subject*)raptor_avltree_iterator_get(iter);
+ if (blank) {
+ raptor_rdfxmla_emit_subject(serializer, blank, context->starting_depth);
+ }
+ if (raptor_avltree_iterator_next(iter))
+ break;
}
+ if (iter)
+ raptor_free_avltree_iterator(iter);
}
-
return 0;
}
@@ -953,10 +964,14 @@
context->namespaces=raptor_new_sequence(NULL, NULL);
context->subjects =
- raptor_new_sequence((raptor_sequence_free_handler *)raptor_free_abbrev_subject,NULL);
+ raptor_new_avltree(serializer->world,
+ (raptor_data_compare_function)raptor_abbrev_subject_cmp,
+ (raptor_data_free_function)raptor_free_abbrev_subject, 0);
context->blanks =
- raptor_new_sequence((raptor_sequence_free_handler *)raptor_free_abbrev_subject,NULL);
+ raptor_new_avltree(serializer->world,
+ (raptor_data_compare_function)raptor_abbrev_subject_cmp,
+ (raptor_data_free_function)raptor_free_abbrev_subject, 0);
context->nodes =
raptor_new_avltree(serializer->world,
@@ -1042,12 +1057,12 @@
}
if(context->subjects) {
- raptor_free_sequence(context->subjects);
+ raptor_free_avltree(context->subjects);
context->subjects=NULL;
}
if(context->blanks) {
- raptor_free_sequence(context->blanks);
+ raptor_free_avltree(context->blanks);
context->blanks=NULL;
}
@@ -1500,17 +1515,15 @@
if(node == predicate) {
add_property=0;
-
if(object->type == RAPTOR_IDENTIFIER_TYPE_ANONYMOUS) {
/* look for any generated blank node associated with this
* statement and free it
*/
- int idx=0;
- if(raptor_abbrev_subject_find(context->blanks, object_type,
- statement->object, &idx))
- raptor_sequence_set_at(context->blanks, idx, NULL);
+ raptor_abbrev_subject *blank =
+ raptor_abbrev_subject_find(context->blanks, object_type,
+ statement->object);
+ if (subject) raptor_avltree_delete(context->blanks, blank);
}
-
break;
}
}
Index: src/raptor_abbrev.c
===================================================================
--- src/raptor_abbrev.c (revision 15635)
+++ src/raptor_abbrev.c (working copy)
@@ -298,94 +298,10 @@
/**
- * raptor_abbrev_node_matches:
- * @node: #raptor_abbrev_node to compare
- * @node_type: Raptor identifier type
- * @node_data: For node_type RAPTOR_IDENTIFIER_TYPE_ORDINAL, int* to the
- * ordinal.
- * @datatype: Literal datatype or NULL
- * @language: Literal language or NULL
- *
- * INTERNAL - Compare an existing abbrev node against node described by
- * parameters
- *
- * Return value: non-zero if @node matches the node described by the rest of
- * the parameters.
- */
-int
-raptor_abbrev_node_matches(raptor_abbrev_node* node,
- raptor_identifier_type node_type,
- const void *node_data, raptor_uri *datatype,
- const unsigned char *language)
-{
- int rv = 0;
-
- if(node->type != node_type)
- return 0;
-
- switch (node->type) {
- case RAPTOR_IDENTIFIER_TYPE_RESOURCE:
- case RAPTOR_IDENTIFIER_TYPE_PREDICATE:
- rv = raptor_uri_equals_v2(node->world, node->value.resource.uri,
- (raptor_uri *)node_data);
- break;
-
- case RAPTOR_IDENTIFIER_TYPE_ANONYMOUS:
- rv = !strcmp((const char*)node->value.blank.string,
- (const char *)node_data);
- break;
-
- case RAPTOR_IDENTIFIER_TYPE_LITERAL:
- case RAPTOR_IDENTIFIER_TYPE_XML_LITERAL:
-
- if((char *)node->value.literal.string != NULL &&
- (char *)node_data != NULL) {
-
- /* string */
- rv = (strcmp((char *)node->value.literal.string,
- (char *)node_data) == 0);
-
- /* language */
- if((char *)node->value.literal.language != NULL &&
- (char *)language != NULL)
- rv &= (strcmp((char *)node->value.literal.language,
- (char *)language) == 0);
- else if((char *)node->value.literal.language != NULL ||
- (char *)language != NULL)
- rv= 0;
-
- /* datatype */
- if(node->value.literal.datatype != NULL && datatype != NULL)
- rv &= (raptor_uri_equals_v2(node->world, node->value.literal.datatype,datatype) !=0);
- else if(node->value.literal.datatype != NULL || datatype != NULL)
- rv = 0;
-
- } else {
- RAPTOR_FATAL1("string must be non-NULL for literal or xml literal\n");
- rv = 0;
- }
-
- break;
-
- case RAPTOR_IDENTIFIER_TYPE_ORDINAL:
- rv = (node->value.ordinal.ordinal == *(int *)node_data);
- break;
-
- case RAPTOR_IDENTIFIER_TYPE_UNKNOWN:
- default:
- /* Nothing to do */
- break;
- }
-
- return rv;
-}
-
-
-/**
* raptor_abbrev_node_lookup:
- * @nodes: Sequence of nodes to search
+ * @nodes: Tree of nodes to search
* @node_type: Raptor identifier type
- * @node_value: Node value to search with using raptor_abbrev_node_matches().
+ * @node_value: Node value to search for
* @datatype: Literal datatype or NULL
* @language: Literal language or NULL
* @created_p: (output parameter) set to non-0 if a node was created
@@ -528,13 +444,15 @@
subject = (raptor_abbrev_subject*)RAPTOR_CALLOC(raptor_subject, 1,
sizeof(raptor_abbrev_subject));
- if(subject) {
+ if (subject) {
subject->node = node;
subject->node->ref_count++;
subject->node->count_as_subject++;
subject->node_type = NULL;
+ subject->valid = 1;
+
subject->properties =
raptor_new_avltree(node->world,
(raptor_data_compare_function)raptor_compare_abbrev_po,
@@ -580,6 +498,22 @@
}
+int
+raptor_abbrev_subject_valid(raptor_abbrev_subject *subject)
+{
+ return subject->valid;
+}
+
+
+int
+raptor_abbrev_subject_invalidate(raptor_abbrev_subject *subject)
+{
+ subject->valid = 0;
+}
+
+
+
+
/**
* raptor_subject_add_property:
* @subject: subject node to add to
@@ -673,63 +607,69 @@
}
+int
+raptor_abbrev_subject_cmp(raptor_abbrev_subject* subject1,
+ raptor_abbrev_subject* subject2)
+{
+ return raptor_abbrev_node_cmp(subject1->node, subject2->node);
+}
+
+
raptor_abbrev_subject*
-raptor_abbrev_subject_find(raptor_sequence *sequence,
+raptor_abbrev_subject_find(raptor_avltree *subjects,
raptor_identifier_type node_type,
- const void *node_data, int *idx)
+ const void *node_data)
{
raptor_abbrev_subject* rv_subject = NULL;
- int i;
+ raptor_abbrev_node* lookup_node = NULL;
+ raptor_abbrev_subject* lookup = NULL;
+
+ /* datatype and language both null for a subject node */
- for(i=0; i < raptor_sequence_size(sequence); i++) {
- raptor_abbrev_subject* subject=(raptor_abbrev_subject*)raptor_sequence_get_at(sequence, i);
+ lookup_node = raptor_new_abbrev_node(subjects->world,
+ node_type, node_data, NULL, NULL);
+
+ lookup = raptor_new_abbrev_subject(lookup_node);
- if(subject &&
- raptor_abbrev_node_matches(subject->node, node_type, node_data, NULL, NULL)) {
- rv_subject = subject;
- break;
- }
-
- }
+ rv_subject = raptor_avltree_search(subjects, lookup);
- if(idx)
- *idx = i;
-
+ raptor_free_abbrev_subject(lookup);
+ raptor_free_abbrev_node(lookup_node);
+
return rv_subject;
}
raptor_abbrev_subject*
raptor_abbrev_subject_lookup(raptor_avltree* nodes,
- raptor_sequence* subjects,
- raptor_sequence* blanks,
+ raptor_avltree* subjects,
+ raptor_avltree* blanks,
raptor_identifier_type node_type,
const void *node_data,
int* created_p)
{
- raptor_sequence *sequence;
+ raptor_avltree *tree;
raptor_abbrev_subject* rv_subject;
- /* Search for specified resource in resources array.
- * FIXME: this should really be a hash, not a list.
+ /* Search for specified resource.
*/
- sequence= (node_type == RAPTOR_IDENTIFIER_TYPE_ANONYMOUS) ?
+ tree = (node_type == RAPTOR_IDENTIFIER_TYPE_ANONYMOUS) ?
blanks : subjects;
- rv_subject= raptor_abbrev_subject_find(sequence, node_type,
- node_data, NULL);
+ rv_subject= raptor_abbrev_subject_find(tree, node_type, node_data);
if(created_p)
*created_p=(!rv_subject);
/* If not found, create one and insert it */
if(!rv_subject) {
+
raptor_abbrev_node* node = raptor_abbrev_node_lookup(nodes, node_type,
node_data, NULL, NULL,
NULL);
if(node) {
rv_subject = raptor_new_abbrev_subject(node);
if(rv_subject) {
- if(raptor_sequence_push(sequence, rv_subject)) {
+ if(raptor_avltree_add(tree, rv_subject)) {
rv_subject = NULL;
}
}
_______________________________________________
redland-dev mailing list
[email protected]
http://lists.librdf.org/mailman/listinfo/redland-dev