!pip install utils
!pip install scikit-learn
!mkdir '/content/gdrive/My Drive/movie review'
!mkdir '/content/gdrive/My Drive/movie review/good/'
!mkdir '/content/gdrive/My Drive/movie review/bad/'
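These folders form the layout that sklearn.datasets.load_files expects later: one subfolder per class, each holding plain-text files. The review texts themselves are not created here; they are assumed to be copied in by hand, roughly like this (file names are illustrative):

movie review/
├── good/    # positive reviews, e.g. reviews_pos.txt
└── bad/     # negative reviews, e.g. reviews_neg.txt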
from utils import *
import tensorflow as tf
import sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split
import time
import re
import numpy as np

def clearstring(string):
    # Keep only letters and digits; everything else becomes a space
    string = re.sub('[^A-Za-z0-9]+', ' ', string)
    # Split the sentence into a list of words
    string = string.split(' ')
    string = filter(None, string)
    # Strip leading/trailing whitespace from each word
    string = [y.strip() for y in string]
    string = ' '.join(string)
    return string.lower()

def seperate_dataset(trainset, ratio=0.5):
    '''
    Split the documents into individual sentences and attach a label to each one
    '''
    text = []
    label = []
    for i in range(int(len(trainset.data) * ratio)):
        # Split the document into sentences, one per line
        data_ = trainset.data[i].split('\n')
        # Drop empty lines
        data_ = list(filter(None, data_))
        for n in range(len(data_)):
            # Remove characters that do not match the cleaning rule
            data_[n] = clearstring(data_[n])
        text += data_
        for n in range(len(data_)):
            # Attach the label; the directory has only two subfolders,
            # so there are only two possible labels
            label.append(trainset.target[i])
    return text, label
s = ' this is 98 !@# *q'
s = clearstring(s)
print(s)
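clearstring is exercised above; seperate_dataset can be smoke-tested the same way with a hypothetical stand-in for the Bunch object that sklearn.datasets.load_files returns (fake data, purely illustrative):

from types import SimpleNamespace

# Two fake "documents", each with one fake label
fake = SimpleNamespace(data=['Great movie!\nLoved it.', 'Terrible.\nAwful acting!'],
                       target=[1, 0])
# ratio=0.5 keeps only the first half of the documents
text, label = seperate_dataset(fake, ratio=0.5)
print(text, label)   # ['great movie', 'loved it'] [1, 1]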
path = '/content/gdrive/My Drive/movie review'
trainset = sklearn.datasets.load_files(container_path = path, encoding = 'UTF-8')
'''
Extract the sentences from the documents into two collections; since the
folder contains only two subfolders, they correspond to two labels
'''
trainset.data, trainset.target = seperate_dataset(trainset, 1.0)
print(trainset.target_names)
print('training data has {0} items'.format(len(trainset.data)))
# Attach a two-element one-hot flag to every record
onehot = np.zeros((len(trainset.data), len(trainset.target_names)))
# Sentences from the 'bad' folder get label [1, 0]; sentences from the 'good' folder get [0, 1]
onehot[np.arange(len(trainset.data)), trainset.target] = 1.0
'''
Split trainset.data, trainset.target and onehot 8:2 into a training part
and a test part
'''
train_X, test_X, train_Y, test_Y, train_onehot, test_onehot = train_test_split(trainset.data,
                                                                               trainset.target,
                                                                               onehot,
                                                                               test_size = 0.2)
concat = ' '.join(trainset.data).split()
vocabulary_size = len(list(set(concat)))
print(vocabulary_size)
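The fancy-indexing line above is compact but easy to misread. A minimal standalone sketch (toy labels, not taken from the real data) shows what it does:

import numpy as np

# Three hypothetical samples, two classes
toy_target = np.array([0, 1, 1])
toy_onehot = np.zeros((3, 2))
# Row i gets a 1.0 in column toy_target[i]
toy_onehot[np.arange(3), toy_target] = 1.0
print(toy_onehot)
# [[1. 0.]
#  [0. 1.]
#  [0. 1.]]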
print(train_onehot[0])
import collections

def build_dataset(words, n_words):
    '''
    Build a word-to-index dictionary from the corpus
    '''
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    # Count word frequencies; only the n_words most frequent words are kept
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        # Number every word; real words get indices starting at 4
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        # Words outside the selected range map to UNK (index 3), matching
        # the UNK=3 default used by word_to_index below
        index = dictionary.get(word, 3)
        if index == 3:
            unk_count += 1
        data.append(index)
    count[3][1] = unk_count
    # Invert the mapping so indices can be turned back into words
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])
'''
The network has two layers of 128 nodes each
'''
size_layer = 128
num_layers = 2
embedded_size = 128
dimension_output = len(trainset.target_names)
learning_rate = 1e-3
maxlen = 50
batch_size = 128

class RNN:
    def __init__(self, size_layer, num_layers, embedded_size, dict_size,
                 dimension_output, learning_rate):
        def cells(reuse = False):
            '''
            TensorFlow's prebuilt RNN cell: like the recurrent node described
            earlier, it keeps an internal record of the current input and
            carries that state over to the next step
            '''
            return tf.nn.rnn_cell.BasicRNNCell(size_layer, reuse=reuse)
        # Input placeholder
        self.X = tf.placeholder(tf.int32, [None, None])
        # Output placeholder
        self.Y = tf.placeholder(tf.float32, [None, dimension_output])
        '''
        The embedding layer: just like the first layer of the word-vector
        network discussed earlier, it is a 2-D matrix whose rows are word
        vectors; the indices in self.X (one-hot vectors in spirit) select
        rows from the matrix
        '''
        embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
        embedded = tf.nn.embedding_lookup(embeddings, self.X)
        # Stack two RNN cells to strengthen the model
        rnn_cells = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
        # Unroll the cells over the input sequence to form the RNN layer
        outputs, _ = tf.nn.dynamic_rnn(rnn_cells, embedded, dtype=tf.float32)
        # Project the 128-dimensional RNN output down to 2 components
        W = tf.get_variable('w', shape=(size_layer, dimension_output),
                            initializer=tf.orthogonal_initializer())
        b = tf.get_variable('b', shape=(dimension_output),
                            initializer=tf.zeros_initializer())
        # Of the two components, the larger one decides the sentence's class
        self.logits = tf.matmul(outputs[:, -1], W) + b
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits,
                                                                           labels=self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
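Before training, it can help to sanity-check the graph wiring. This sketch only builds the graph and prints the static shape of the logits; it assumes the variables defined in the cell above:

tf.reset_default_graph()
demo = RNN(size_layer, num_layers, embedded_size, vocabulary_size + 4,
           dimension_output, learning_rate)
# X is [batch, time]; the embedding lookup makes it [batch, time, 128];
# outputs[:, -1] keeps only the last timestep, so logits is [batch, 2]
print(demo.logits.shape)   # (?, 2)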
# Second variant: identical to the RNN class above except that cells() returns
# an LSTM cell instead of a basic RNN cell. A basic RNN cell tends to lose
# long-range context as gradients vanish across many timesteps; the LSTM's
# gating carries information across the full 50-word window, which usually
# improves validation accuracy. Re-run the class definition with cells()
# replaced by:
        def cells(reuse = False):
            return tf.nn.rnn_cell.LSTMCell(size_layer, reuse=reuse)
'''
Convert the words of input sentences into their dictionary indices
'''
def word_to_index(corpus, dic, maxlen, UNK=3):
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        '''
        A sentence may not exceed maxlen words; anything longer is truncated.
        Then, walking from the last word back to the first, each word is
        replaced by its index, so short sentences end up right-aligned with
        zero padding on the left
        '''
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            try:
                X[i, -1 - no] = dic[k]
            except:
                X[i, -1 - no] = UNK
    return X

s = []
s.append("the rock is destined to be")
x = word_to_index(s, dictionary, maxlen)
print(x)
import time

tf.reset_default_graph()
sess = tf.InteractiveSession()
rnn = RNN(size_layer, num_layers, embedded_size, vocabulary_size + 4,
          dimension_output, learning_rate)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables(), max_to_keep=2)
checkpoint_dir = '/content/gdrive/My Drive/dataset/checkpoints_basic_rnn'

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 30, 0, 0, 0
while True:
    lasttime = time.time()
    # Stop once validation accuracy has not improved for EARLY_STOPPING epochs
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch: %d\n' % (EPOCH))
        break
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    # Train the network
    for i in range(0, (len(train_X) // batch_size) * batch_size, batch_size):
        batch_x = word_to_index(train_X[i : i + batch_size], dictionary, maxlen)
        acc, loss, _ = sess.run([rnn.accuracy, rnn.cost, rnn.optimizer],
                                feed_dict = {rnn.X: batch_x,
                                             rnn.Y: train_onehot[i : i + batch_size]})
        train_loss += loss
        train_acc += acc
    # Evaluate on the held-out data
    for i in range(0, (len(test_X) // batch_size) * batch_size, batch_size):
        batch_x = word_to_index(test_X[i : i + batch_size], dictionary, maxlen)
        acc, loss = sess.run([rnn.accuracy, rnn.cost],
                             feed_dict = {rnn.X: batch_x,
                                          rnn.Y: test_onehot[i : i + batch_size]})
        test_loss += loss
        test_acc += acc
    train_loss /= (len(train_X) // batch_size)
    train_acc /= (len(train_X) // batch_size)
    test_loss /= (len(test_X) // batch_size)
    test_acc /= (len(test_X) // batch_size)
    if test_acc > CURRENT_ACC:
        print('epoch: %d, pass acc: %f, current acc %f' % (EPOCH, CURRENT_ACC, test_acc))
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1
    print('time taken: ', time.time() - lasttime)
    print('epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
          % (EPOCH, train_loss, train_acc, test_loss, test_acc))
    path = saver.save(sess, checkpoint_dir, global_step = EPOCH)
    EPOCH += 1
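Once training stops, the weights held in the still-open session can classify new text. A minimal sketch; the sentence is hypothetical, and the pipeline simply reuses clearstring, word_to_index and the rnn graph from above:

sentence = 'this movie was a waste of time'   # hypothetical input
batch_x = word_to_index([clearstring(sentence)], dictionary, maxlen)
logits = sess.run(rnn.logits, feed_dict={rnn.X: batch_x})
# The larger logit picks the class; target_names holds the folder names
print(trainset.target_names[np.argmax(logits[0])])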