This is a k-NN (k-nearest-neighbours) classifier for a set of articles I have collected: given a new article, it determines which category the article belongs to. I am not getting very good results :/
k = 23 training_folder = './data/training/' minn_folder = training_folder + 'Minnesota/' health_folder = training_folder + 'Health/' def remove_punctuation(text): return regex.sub(r'\p{P}+', "", text) def file_list(folder): return [f for f in listdir(folder) if isfile(join(folder, f))] def all_file_list(): minn_files = file_list(minn_folder) for i in range(len(minn_files)): minn_files[i] = minn_folder + minn_files[i] health_files = file_list(health_folder) for i in range(len(health_files)): health_files[i] = health_folder + health_files[i] return minn_files + health_files def file_to_word_list(f): fr = open(f, 'r') text_read = fr.read() text = remove_punctuation(text_read) return text.split() def get_vocabularies(all_files): voc = {} for f in all_files: words = file_to_word_list(f) for w in words: voc[w] = 0 return voc def load_training_data(): all_files = all_file_list() voc = get_vocabularies(all_files) training_data = [] for f in all_files: tag = f.split('/')[3] point = copy.deepcopy(voc) words = file_to_word_list(f) for w in words: point[w] += 1 d = {'tag': tag, 'point': point} training_data.append(d) return training_data def get_distance(p1, p2): sq_sum = 0 for w in p1: if w in p2: sq_sum += pow(p1[w] - p2[w], 2) return math.sqrt(sq_sum) # This function is implemented for seeing insights of training data def show_distances(training_data): for i in range(len(training_data)): for j in range(i + 1, len(training_data)): print('d(' + str(i) + ',' + str(j) + ')=') print(get_distance(training_data[i]['point'], training_data[j]['point'])) print() for i in range(len(training_data)): print(training_data[i]['tag']) def test(training_data, txt_file): dist_list = [] txt = {} item = {} max_i = 0 words = file_to_word_list(txt_file) for w in words: if w in txt: txt[w] += 1 else: txt[w] = 1 for pt in training_data: item['tag'] = pt['tag'] item['distance'] = get_distance(pt['point'], txt) if len(dist_list) < k: dist_list.append(copy.deepcopy(item)) else: for i in range(1, k): 
if dist_list[i]['distance'] > dist_list[max_i]['distance']: max_i = i if dist_list[max_i]['distance'] > item['distance']: dist_list[max_i] = item vote_result = {} for d in dist_list: if d['tag'] in vote_result: vote_result[d['tag']] += 1 else: vote_result[d['tag']] = 1 # print(vote_result) # for testing result = dist_list[0]['tag'] for vote in vote_result: if vote_result[vote] > vote_result[result]: result = vote return result def main(txt): td = load_training_data() print(show_distances(td)) # show_distances(td) # for test usage only print('Category: ' + test(td, txt)) if __name__ == '__main__': main(sys.argv[1]) -- https://mail.python.org/mailman/listinfo/python-list