Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/195#discussion_r151226712 --- Diff: src/ports/postgres/modules/graph/hits.py_in --- @@ -95,234 +109,391 @@ def hits(schema_madlib, vertex_table, vertex_id, edge_table, edge_args, FROM {1} """.format(vertex_id, vertex_table))[0]["cnt"] - # Assign default threshold value based on number of nodes in the graph. if threshold is None: - threshold = 1.0 / (n_vertices * 1000) + threshold = get_default_threshold_for_centrality_measures(n_vertices) + # table/column names used when grouping_cols is set. edge_temp_table = unique_string(desp='temp_edge') + grouping_cols_comma = grouping_cols + ',' if grouping_cols else '' distribution = ('' if is_platform_pg() else - "DISTRIBUTED BY ({0})".format(dest)) - plpy.execute("DROP TABLE IF EXISTS {0}".format(edge_temp_table)) + "DISTRIBUTED BY ({0}{1})".format( + grouping_cols_comma, dest)) + drop_tables([edge_temp_table]) plpy.execute(""" CREATE TEMP TABLE {edge_temp_table} AS SELECT * FROM {edge_table} {distribution} """.format(**locals())) - # GPDB and HAWQ have distributed by clauses to help them with indexing. - # For Postgres we add the index explicitly. - if is_platform_pg(): - plpy.execute("CREATE INDEX ON {0}({1})".format( - edge_temp_table, dest)) + ###################################################################### + # Set initial authority_norm and hub_norm as 1, so that later the final + # norm should be positive number + authority_init_value = 1.0 + hub_init_value = 1.0 + + subquery1 = unique_string(desp='subquery1') + + distinct_grp_table = '' + select_grouping_cols_comma = '' + select_subquery1_grouping_cols_comma = '' --- End diff -- While this may not be perfect, we felt that it is slightly better than having short variable names that do not convey much meaning at all.
---