# Sample corpus: maps a document id to its raw text. The text is lowercase
# and space-separated, so plain str.split() is enough to tokenise it.
# NOTE(review): 'technolgy' in d3 is spelled this way in the source data; it
# is left untouched because the tokens become keys of the inverted index.
docu_set = {
    'd1': 'i love shanghai',
    'd2': 'i am from shanghai now i study in tongji university',
    'd3': 'i am from lanzhou now i study in lanzhou university of science and technolgy',
}
下面用这张表做一个简单的搜索引擎，采用倒排索引。首先对所有文档做分词，得到所有文档中出现过的词的集合（词汇表）：
1 2 3 4 5 6
# Tokenise every document on whitespace and collect all tokens, duplicates
# included, in document order.
all_words = [token for text in docu_set.values() for token in text.split()]

# Collapse the token list to the distinct vocabulary of the corpus.
set_all_words = set(all_words)
print(set_all_words)
# Build the inverted index: each distinct word maps to the list of ids of the
# documents that contain it, in the iteration order of docu_set.
#
# Each document is tokenised exactly once up front and stored as a set, so the
# per-word membership test below is a constant-time lookup instead of
# re-splitting the text and scanning a list for every (word, document) pair.
doc_tokens = {doc_id: set(text.split()) for doc_id, text in docu_set.items()}

invert_index = {
    word: [doc_id for doc_id, tokens in doc_tokens.items() if word in tokens]
    for word in set_all_words
}
print(invert_index)