Hi all, My name is Fábio and I'm new to scikit-learn. I am trying to cluster information from a file with a Python script (which I found on the web), but I saw that the output has a problem with numbers. See: Script# import clickimport reimport numpyimport random from collections import defaultdict from sklearn.feature_extraction.text import TfidfVectorizerfrom sklearn.cluster import KMeans
@click.command()@click.argument('filename')@click.option('--clusters', default=50, help='Number of clusters')@click.option('--sample', default=400, help='Number of samples to print')def cluster_lines(filename, clusters, sample): lines = numpy.array(list(_get_lines(filename))) doc_feat = TfidfVectorizer().fit_transform(lines) km = KMeans(clusters).fit(doc_feat) k = 0 clusters = defaultdict(list) for i in km.labels_: clusters[i].append(lines[k]) k += 1 s_clusters = sorted(clusters.values(), key=lambda l: -len(l)) for cluster in s_clusters: print 'Cluster [%s]:' % len(cluster) if len(cluster) > sample: cluster = random.sample(cluster, sample) for line in cluster: print line print '--------' def _clean_line(line): line = line.strip().lower() line = re.sub('\d+', '(N)', line) return line def _get_lines(filename): for line in open(filename).readlines(): yield _clean_line(line) if __name__ == '__main__': cluster_lines() output [root@vmcaiosyscolprod01 71001492]# python Cluster-LearnMachine.py DataSets/ospf.teste3Cluster [7]:"rjbotaa max-metric router-lsa on-startup ispf log-adjacency-changes detail auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood passive-interface default maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area""rjmteab max-metric router-lsa on-startup ispf log-adjacency-changes detail auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood passive-interface default maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area""rjmckaa max-metric router-lsa on-startup ispf log-adjacency-changes detail auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood passive-interface default maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area""rjdqcaa max-metric router-lsa on-startup ispf log-adjacency-changes detail 
auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood passive-interface default maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area""rjdqcab max-metric router-lsa on-startup ispf log-adjacency-changes detail auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood passive-interface default maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area""rjcenaa max-metric router-lsa on-startup ispf log-adjacency-changes detail auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood passive-interface default maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area""rjcenab max-metric router-lsa on-startup ispf log-adjacency-changes detail auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood passive-interface default maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area"--------Cluster [1]:"rjbotab max-metric router-lsa on-startup log-adjacency-changes detail auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area"--------Cluster [1]:"rjmteaa ispf log-adjacency-changes detail auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood passive-interface default maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area"--------Cluster [1]:"rjmckab max-metric router-lsa on-startup ispf log-adjacency-changes detail auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood passive-interface default maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng 
area"-------- Note that the output shows (N) in place of the numbers, and I have not found a way to use the biggest cluster as a template to find the differences between it and the other clusters. How can I do that? Thanks
_______________________________________________ scikit-learn mailing list scikit-learn@python.org https://mail.python.org/mailman/listinfo/scikit-learn