Friendly greetings !
i'm on the same problem since many days (an hour per day) and i can't
find a solution
i have 2 index (see source doe below)
No problem with the "parsed" index, but the "url" index never return any result.
I don't if it's because the url isn't indexed or because the query on
the index is wrong.
Or something else ?

Could you please take a look and see what's wrong ?
thank you

(you can try to run the script, it works)

require 'nokogiri'
require 'open-uri'
require 'neography'

#init neography
@neo = Neography::Rest.new
neo_root = @neo.get_root

domaine = 'http://www.over-blog.com/'
parsed_idx = "ob_parsed_idx"
url_idx = "ob_url_idx"

#FIRST RUN
#ob_root_node = @neo.create_node("domaine" => domaine, "parsed" =>
"false", "url" => domaine)
#@neo.create_relationship("obgraph", neo_root, ob_root_node)
#pidx = @neo.create_node_index(parsed_idx)
#uidx = @neo.create_node_index(url_idx)
#@neo.add_node_to_index(parsed_idx, "parsed", "false", ob_root_node)
##@neo.add_node_to_index(url_idx, "url", domaine, ob_root_node)
#node_to_parse = @neo.get_node_index(parsed_idx, "parsed", "false")

ob_root_node = @neo.traverse(neo_root, "nodes", { "relationships" =>
[{"type"=> "obgraph", "direction" => "out" }], "depth" => 1})
#node_to_parse = @neo.traverse(ob_root_node, "nodes", {
"relationships" => [{"type"=> "link", "direction" => "out" }] })
node_to_parse = @neo.get_node_index(parsed_idx, "parsed", "false")

#print @neo.list_node_indexes

node_to_parse.each do |node|

    url_to_parse = @neo.get_node_properties(node)["url"]
    printf("exploring : %s\n", url_to_parse)

    doc = Nokogiri::HTML(open(url_to_parse))
    @neo.set_node_properties(node, {"parsed" => "true"})
    @neo.remove_node_from_index(parsed_idx, node)
    @neo.add_node_to_index(parsed_idx, "parsed", "true", node)

    doc.xpath('//a').each do |link|

        link_text = link.content.strip()
        link_url = link['href'].to_s().strip()
        link_title = link['title'].to_s().strip()

        link_url = link_url.sub(/#.*$/, "")

        if(link_url =~ /^\/.*/)
            link_url = link_url.sub(/^\//, '')
            link_url = domaine + link_url
        end

        if(link_text == '')
            link_text = link_title
        end


        #skiping empty stuff
        next if link_url.empty?
        next if link_text.empty?

        node_found = @neo.find_node_index(url_idx, "url", link_url)
        #node_found = @neo.traverse(ob_root_node, "nodes", {
"relationships" => [{"direction" => "out" }], "prune evaluator" =>
{"language" => "javascript", "body" =>
"position.endNode().getProperty(url) == #{link_url};"}, "return
filter" => {"language" => "builtin",  "name" => "all but start
node"}})
        print "\nsearching url #{link_url}\n"
        printf("node_found : %s \n", node_found)
        if(node_found.nil?)
            printf("create node %s\n", link_url)
            nnode = @neo.create_node("parsed" => "false", "url" => link_url)
            @neo.add_node_to_index(url_idx, "url", link_url, nnode)
            @neo.add_node_to_index(parsed_idx, "parsed", "false", nnode)
        else
            printf("node_found : %s \n", node_found)
        end


        nrel = @neo.create_relationship("link", node, nnode)
        @neo.set_relationship_properties(nrel, {"text" => link_text})

        #printf("%s => %s\n", link_text, link_url)

    end

    sleep(1.0)


end


-- 
Laurent "ker2x" Laborde
Sysadmin & DBA at http://www.over-blog.com/
_______________________________________________
Neo4j mailing list
User@lists.neo4j.org
https://lists.neo4j.org/mailman/listinfo/user

Reply via email to