A simple word2vec implementation

## Overview

word2vec has been one of the more popular embedding methods in recent years. From a practical point of view there are two training schemes, CBOW and skip-gram: one infers the center word from its context, the other infers the context from the center word. Either way, we do not actually care about the model's output; what we care about is the model's internal hidden layer.
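
To make the two directions concrete, here is a minimal sketch (my own illustration, not part of the code below; the toy sentence and window size are made up) of how training pairs are formed under each scheme:

```python
# Toy illustration of the two pairing schemes, assuming a window of 1.
sentence = ["the", "quick", "brown", "fox", "jumps"]
window = 1
for i in range(window, len(sentence) - window):
    center = sentence[i]
    context = [sentence[i - j] for j in range(1, window + 1)] + \
              [sentence[i + j] for j in range(1, window + 1)]
    # skip-gram: predict each context word from the center word
    print("skip-gram:", [(center, c) for c in context])
    # CBOW: predict the center word from all of its context words
    print("CBOW     :", (context, center))
```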

## Core code

A standard training pipeline built with PyTorch:

```python
import torch
from torch import nn
from functools import reduce
from torch.utils.data import DataLoader, Dataset
from torch.nn import Module
import matplotlib.pyplot as plt


class WordsVec(Dataset):
    def __init__(self, datafile="dataset/sentences.txt", window=2):
        super(WordsVec, self).__init__()
        self.datafile = datafile
        self.tokens = self._unique_words()
        self.window = window
        self.tokensize = len(self.tokens)
        self.paired_dataset = self._prepare_dataset("kgram")

    def _unique_words(self):
        with open(self.datafile, "r") as fp:
            ts = fp.read()
        self.sentences = ts.split('\n')
        self.tokenized_sentences = list(map(lambda x: x.split(" "), self.sentences))
        tokens = list(set(reduce(lambda x, y: x + y, self.tokenized_sentences)))
        self.token_id_dict = {token: ids for ids, token in enumerate(tokens)}
        return tokens

    def _prepare_dataset(self, *args):
        dataset = []
        for line in self.tokenized_sentences:
            if len(line) < 2 * self.window + 1:
                continue
            for ps in range(self.window, len(line) - self.window):
                # ps points to the current (center) position, which is padded
                # on both sides by `self.window` tokens
                target = line[ps]
                upper_pointer = downer_point = ps
                for i in range(1, self.window + 1):
                    if args[0] == "kgram":
                        # skip-gram: one-hot of the center word -> id of a context word
                        dataset.append([self._one_hot_word(target),
                                        self.word2id(line[upper_pointer + i])])
                        dataset.append([self._one_hot_word(target),
                                        self.word2id(line[downer_point - i])])
                    else:
                        # CBOW direction: one-hot of a context word -> one-hot of the center word
                        dataset.append([self._one_hot_word(line[upper_pointer + i]),
                                        self._one_hot_word(target)])
                        dataset.append([self._one_hot_word(line[downer_point - i]),
                                        self._one_hot_word(target)])
        return dataset

    def _one_hot(self, index):
        hotvector = torch.zeros(self.maxwordlen)
        hotvector[index] = 1
        return hotvector

    def _one_hot_word(self, word):
        hotvector = torch.zeros(self.maxwordlen)
        hotvector[self.token_id_dict[word]] = 1
        return hotvector

    def id2word(self, ids):
        return self.tokens[ids]

    def word2id(self, word):
        return self.token_id_dict[word]

    def __len__(self):
        return len(self.paired_dataset)

    def __getitem__(self, item):
        return self.paired_dataset[item]

    @property
    def maxwordlen(self):
        return self.tokensize


train_set = WordsVec()
train_loader = DataLoader(train_set, batch_size=8, shuffle=True)


class TwoProjection(Module):
    def __init__(self, word_len, hidden_size):
        super(TwoProjection, self).__init__()
        self.W = nn.Parameter(torch.randn(word_len, hidden_size))
        self.V = nn.Parameter(torch.randn(hidden_size, word_len))

    def forward(self, hot_vec):
        hidden_layer = torch.matmul(hot_vec, self.W)
        output_layer = torch.matmul(hidden_layer, self.V)
        return output_layer


twoproj = TwoProjection(train_set.maxwordlen, 2)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(twoproj.parameters(), lr=0.001)


def trainModel():
    total_loss = 0
    for i, datax in enumerate(train_loader):
        inputs, label = datax
        pred = twoproj(inputs)
        loss = criterion(pred, label)
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print("\t", total_loss)


if __name__ == "__main__":
    for epochs in range(2000):
        print("Epoch:{}".format(epochs))
        trainModel()

    for i, label in enumerate(train_set.tokens):
        W, V = twoproj.parameters()
        # W is the word-embedding matrix; row i is the vector for token i
        x, y = float(W[i][0]), float(W[i][1])
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    plt.show()
```
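
Once trained, the rows of W can be used directly as word vectors. As a small usage sketch (this helper is my own addition, not part of the original code; it assumes `model` is a trained `TwoProjection`, `dataset` is the `WordsVec` instance, and `word` is in the vocabulary), nearest neighbours can be looked up by cosine similarity over W:

```python
import torch

# Sketch of a nearest-neighbour lookup over the learned embedding matrix W.
def nearest_neighbours(model, dataset, word, k=3):
    W = model.W.data                                   # (vocab_size, hidden_size)
    query = W[dataset.word2id(word)].unsqueeze(0)      # (1, hidden_size)
    sims = torch.nn.functional.cosine_similarity(W, query, dim=1)
    best = torch.topk(sims, k + 1).indices.tolist()    # k+1: the word matches itself
    return [dataset.id2word(i) for i in best if dataset.id2word(i) != word][:k]
```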

The core layers of this code follow a version found online that builds the two projections with the nn.Parameter module. My earliest version used torch.nn.Linear instead, since a Linear module lets you specify both the input and the output dimension: for a linear map of the form $y = xw$ with $x \in \mathbb{R}^{n}$ and $y \in \mathbb{R}^{v}$, a weight $w \in \mathbb{R}^{n \times v}$ is enough to map between the two dimensions. Using linear layers works just as well; the only difference is that, to read out a word vector at the end, you have to multiply the first layer's weight matrix by the word's one-hot vector. The modified part of the code is given below:

```python
# Structure of the Network
class TwoProjection(Module):
    def __init__(self, word_len, hidden_size):
        super(TwoProjection, self).__init__()
        self.l1 = nn.Linear(word_len, hidden_size, bias=False)
        self.l2 = nn.Linear(hidden_size, word_len, bias=False)

    def forward(self, hot_vec):
        hidden_layer = self.l1(hot_vec)
        output_layer = self.l2(hidden_layer)
        return output_layer


# Convert a word into a vector and visualize it
if __name__ == "__main__":
    # training here
    for i, label in enumerate(train_set.tokens):
        W = twoproj.l1.weight.data  # shape: (hidden_size, word_len); bias is disabled
        x, y = torch.matmul(W, train_set._one_hot_word(label))
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    plt.show()
```
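
Since multiplying the (hidden_size, word_len) weight by a one-hot vector simply selects one column, the same word vector can also be read out by indexing directly. A one-line equivalent of the matmul above, using the same `twoproj`, `train_set`, and `label` as in the listing:

```python
# Equivalent to the matmul with the one-hot vector: pick the column of the
# first layer's weight matrix that corresponds to the word's id.
vec = twoproj.l1.weight.data[:, train_set.word2id(label)]
x, y = float(vec[0]), float(vec[1])
```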

## Results

I also tried enabling the bias term in the Linear layers, and that produces the expected result as well. (figure: scatter plot of the learned word vectors produced by the code) With the Linear module, about 200 epochs are enough to reach a good embedding, whereas the Parameter version needs at least twice as many epochs to reach a comparable one.

Author: MyTech::Author

Link: https://mytech.pages.dev/2021/10/29/echo_353/