import os
import pickle
import random

import numpy as np

# create_lexicon() and sample_handling() are assumed to be defined elsewhere
# in this post: they build the word lexicon and turn each line of text into
# a [bag-of-words vector, one-hot label] pair.


def create_features_and_labels_set(pos, neg, hm_lines=10000000,
                                   word_frequency_min=50,
                                   word_frequency_max=1000, test_size=0.2,
                                   dataset_name='sentiment_dataset.pickle'):
    """
    Process the raw text into a feature-vector dataset that machine
    learning algorithms can consume.
    :param pos: file of positive samples.
    :param neg: file of negative samples.
    :param hm_lines: maximum number of lines to read from each file.
    :param word_frequency_min: minimum number of occurrences a word needs
                               in the text to enter the lexicon.
    :param word_frequency_max: maximum number of occurrences a word may
                               have in the text and still enter the lexicon.
    :param test_size: fraction of the data reserved for the test set.
    :param dataset_name: file name under which the finished dataset is cached.
    :return: x_train, y_train, x_test, y_test
    """
    # Reuse a previously built dataset if one is already cached on disk.
    if os.path.exists(dataset_name):
        print "loading dataset..."
        try:
            with open(dataset_name, 'rb') as dataset:
                return pickle.load(dataset)
        except Exception as e:
            print "rebuilding the dataset, due to: '%s'" % e

    # Build the lexicon, then vectorize both sample files:
    # [1, 0] labels positive samples, [0, 1] labels negative ones.
    features = []
    lexicon = create_lexicon(pos, neg, hm_lines, word_frequency_min, word_frequency_max)
    features += sample_handling(pos, hm_lines, lexicon, [1, 0])
    features += sample_handling(neg, hm_lines, lexicon, [0, 1])

    # Shuffle so the train/test split is not ordered positive-then-negative.
    random.shuffle(features)
    features = np.array(features)

    # Convert the test fraction into a sample count and split the array:
    # column 0 holds the feature vectors, column 1 the one-hot labels.
    test_size = int(test_size * len(features))
    x_train = list(features[:, 0][:-test_size])
    y_train = list(features[:, 1][:-test_size])
    x_test = list(features[:, 0][-test_size:])
    y_test = list(features[:, 1][-test_size:])

    # Cache the finished split so later runs can skip the preprocessing.
    with open(dataset_name, 'wb') as dataset:
        pickle.dump([x_train, y_train, x_test, y_test], dataset)

    return x_train, y_train, x_test, y_test
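
Because the result is pickled under dataset_name, the first call pays the full preprocessing cost and every later call just loads the cache. A minimal usage sketch, assuming two sample files named pos.txt and neg.txt (hypothetical names) sit next to the script:

# Minimal usage sketch; 'pos.txt' and 'neg.txt' are hypothetical file names.
if __name__ == '__main__':
    x_train, y_train, x_test, y_test = create_features_and_labels_set(
        'pos.txt', 'neg.txt', test_size=0.2)
    print "train samples: %d, test samples: %d" % (len(x_train), len(x_test))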