import os
import pickle
import random

import numpy as np


def create_features_and_labels_set(pos, neg, hm_lines=10000000,
                                   word_frequency_min=50,
                                   word_frequency_max=1000,
                                   test_size=0.2,
                                   dataset_name='sentiment_dataset.pickle'):
    """
    Turn raw text into feature-vector datasets that a machine learning
    algorithm can consume.

    :param pos: file of positive samples.
    :param neg: file of negative samples.
    :param hm_lines: maximum number of lines to read.
    :param word_frequency_min: minimum number of times a word must appear in the text to enter the lexicon.
    :param word_frequency_max: maximum number of times a word may appear in the text and still enter the lexicon.
    :param test_size: fraction of the data reserved for the test set.
    :param dataset_name: file name under which the generated dataset is saved.
    :return: x_train, y_train, x_test, y_test
    """
    # Reuse a previously built dataset if one is cached on disk.
    if os.path.exists(dataset_name):
        print("loading dataset...")
        try:
            with open(dataset_name, 'rb') as dataset:
                return pickle.load(dataset)
        except Exception as e:
            print("trying to create it again, due to: '%s'" % e)

    features = []
    lexicon = create_lexicon(pos, neg, hm_lines, word_frequency_min, word_frequency_max)
    features += sample_handling(pos, hm_lines, lexicon, [1, 0])  # positive label: [1, 0]
    features += sample_handling(neg, hm_lines, lexicon, [0, 1])  # negative label: [0, 1]
    random.shuffle(features)
    # dtype=object is needed because feature vectors and labels have different lengths.
    features = np.array(features, dtype=object)

    # Split the shuffled samples into training and test sets.
    test_size = int(test_size * len(features))
    x_train = list(features[:, 0][:-test_size])
    y_train = list(features[:, 1][:-test_size])
    x_test = list(features[:, 0][-test_size:])
    y_test = list(features[:, 1][-test_size:])

    # Cache the dataset so later runs can skip the preprocessing step.
    with open(dataset_name, 'wb') as dataset:
        pickle.dump([x_train, y_train, x_test, y_test], dataset)

    return x_train, y_train, x_test, y_test
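As a minimal sketch of how the function might be called, assuming the positive and negative corpora live in files named pos.txt and neg.txt (hypothetical paths), and that create_lexicon and sample_handling are defined as elsewhere in this project:

# Hypothetical file names; adjust to wherever your corpora actually live.
x_train, y_train, x_test, y_test = create_features_and_labels_set('pos.txt', 'neg.txt')

# Roughly 80% of the samples land in training, 20% in test.
print("training samples: %d, test samples: %d" % (len(x_train), len(x_test)))

The second run of this snippet should load the cached sentiment_dataset.pickle instead of rebuilding the lexicon and feature vectors.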