import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # Use GPU if available (faster calculations with PyTorch)
words = open('names.txt', 'r').read().splitlines()  # Python list of strings
print("First 10 names: ", words[:10])  # First ten names, each as a separate string
print("Dataset size: ", len(words))  # Number of words in the dataset
print("Shortest name: ", min(len(w) for w in words))  # Shortest word in the dataset
print("Longest name: ", max(len(w) for w in words))  # Longest word in the dataset
b = {}
for w in words:
    chs = ['<S>'] + list(w) + ['<E>']
    for ch1, ch2 in zip(chs, chs[1:]):  # Neat way to get a two-char 'sliding window'
        bigram = (ch1, ch2)  # bigram is the (ch1, ch2) tuple
        b[bigram] = b.get(bigram, 0) + 1  # If the tuple has no count yet, start from 0, then add 1
# b.items() returns tuples like (('<S>', 'a'), 34)
# sorted() would sort items by the bigram tuple, not by the count
# to sort by count: the lambda function replaces the key with the value (count), high -> low
sorted(b.items(), key=lambda keyvalue: -keyvalue[1])
N = torch.zeros((28, 28), dtype=torch.int32)  # datatype would otherwise be float32 by default
# Problem: We'll have only chars, but below we index using ints -> need for a mapping
chars = sorted(list(set(''.join(words))))  # set(): throwing out letter duplicates
# A mapping from letter to number
stoi = {s: i for i, s in enumerate(chars)}
stoi['<S>'] = 26
stoi['<E>'] = 27
# Copied from above, but now modified for the mapping
for w in words:
    chs = ['<S>'] + list(w) + ['<E>']
    for ch1, ch2 in zip(chs, chs[1:]):  # Neat way to get a two-char 'sliding window'
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1  # Increment the 2D cell by 1
itos = {i: s for s, i in stoi.items()}  # Basically reversing the stoi element order
plt.figure(figsize=(16, 16))
plt.imshow(N, cmap='Blues')  # Heatmap, basically
for i in range(28):
    for j in range(28):
        chstr = itos[i] + itos[j]  # Add text for the heat tiles
        plt.text(j, i, chstr, ha="center", va="bottom", color="gray")
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color="gray")
plt.axis('off')
N = torch.zeros((27, 27), dtype=torch.int32)  # 28x28 -> 27x27
chars = sorted(list(set(''.join(words))))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0  # Our special character now has position zero
itos = {i: s for s, i in stoi.items()}
# Copied from above, but now modified
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):  # Neat way to get a two-char 'sliding window'
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1  # Increment the 2D cell by 1
plt.figure(figsize=(16, 16))
plt.imshow(N, cmap='Blues')  # Heatmap, basically
for i in range(27):
    for j in range(27):
        chstr = itos[i] + itos[j]  # Add text for the heat tiles
        plt.text(j, i, chstr, ha="center", va="bottom", color="gray")
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color="gray")
plt.axis('off')
# Sampling from these distributions
# torch.multinomial -> "Give me a probability distribution, I'll give you integers"
# We'll use a PyTorch Generator to make things repeatable (deterministic)
g = torch.Generator().manual_seed(2147483647)
p = torch.rand(3, generator=g)  # Generate three random numbers in [0, 1)
p = p / p.sum()  # normalize these random numbers into a probability distribution
# output: [0.6064, 0.3033, 0.0903]
print(p)
# With probability distribution p, create a list of 20 samples
# replacement=True means drawing an element doesn't prevent drawing that element again
torch.multinomial(p, num_samples=20, replacement=True, generator=g)
# We'd expect ~60% of the 20 items to be 0, ~30% to be 1, ~10% to be 2
# output: [1, 1, 2, 0, 0, 2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1]
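As a quick sanity check (not part of the original notebook; it advances the generator g, which the next cell reseeds anyway): drawing many more samples should reproduce p as empirical frequencies.

# Hedged sketch: with 10,000 samples the observed frequencies should land close to p itself
samples = torch.multinomial(p, num_samples=10000, replacement=True, generator=g)
print(torch.bincount(samples, minlength=3) / 10000.0)  # roughly [0.61, 0.30, 0.09] for the p above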
p = N[0].float()  # probability vector (counts of the first row)
p = p / p.sum()  # normalized probability distribution
g = torch.Generator().manual_seed(2147483647)
ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
# This is an index, a number representing a letter by probability
print(itos[ix])  # Convert index to letter
g = torch.Generator().manual_seed(2147483647)
n = 20
for i in range(n):
    ix = 0  # Start in the special '.' row
    out = []  # holds the characters of the name being generated
    while True:
        p = N[ix].float()  # probability vector (counts of row ix)
        p = p / p.sum()  # normalized probability distribution
        # draw a single sample from this distribution, set this as the new row index
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        # if we find ourselves back in the special '.' row, we're done with this name
        if ix == 0:
            break
    print(''.join(out))
junide.
janasah.
p.
cony.
a.
nn.
kohin.
tolian.
juee.
ksahnaauranilevias.
dedainrwieta.
ssonielylarte.
faveumerifontume.
phynslenaruani.
core.
yaenon.
ka.
jabdinerimikimaynin.
anaasn.
ssorionsush.
Et voilà, we get terrible name suggestions like junide.
Even though it performs badly, the model is still reasonable. It is just not efficient. Let's improve on that.
With p = N[ix].float() # probability vector we always fetch a row and always convert that entire row from integers to floats.
On top of that, we also do p = p / p.sum() on every iteration.
It would be better to have a dedicated, preprocessed matrix P for this; simply a precomputed probability matrix.
As a side step, we use P to sum row-wise. Previously this was p.sum() over all 27 letters.
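Before building P, a small toy illustration (not from the notebook) of why keepdims matters for the row-wise normalization below:

t = torch.tensor([[1., 2., 3.],
                  [4., 5., 6.],
                  [7., 8., 9.]])
print(t.sum(1, keepdims=True).shape)  # torch.Size([3, 1]) -> broadcasts across each row, as intended
print(t.sum(1).shape)                 # torch.Size([3])    -> for a square matrix this would silently broadcast across columns instead
print((t / t.sum(1, keepdims=True)).sum(1))  # tensor([1., 1., 1.]): every row is now a proper distribution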
P = N.float()
# P /= P.sum()  # This would sum over all elements, row- and column-wise -> wrong
# This is allowed in PyTorch:
P /= P.sum(1, keepdims=True)  # the sum is a 27x1 vector (dim 1 means summing over columns, i.e. per row); 27x27 divided by 27x1 works in PyTorch -> broadcasting
# For broadcasting to work like here, each dimension must be either equal or 1 (or not existent), which is the case here (dimensions are aligned from right to left!)
# keepdims=True means the sum keeps its 27x1 shape: columns are summed per row, but the row dimension is kept
g = torch.Generator().manual_seed(2147483647)
for i in range(20):
    ix = 0
    out = []  # Holds the characters of one name
    while True:
        p = P[ix]
        # draw a single sample from this distribution
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 0:  # the stopping special character was drawn
            break
    print(''.join(out))
junide.
janasah.
p.
cony.
a.
nn.
kohin.
tolian.
juee.
ksahnaauranilevias.
dedainrwieta.
ssonielylarte.
faveumerifontume.
phynslenaruani.
core.
yaenon.
ka.
jabdinerimikimaynin.
anaasn.
ssorionsush.
# Copied from above, but now modified
for w in words[:1]:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):  # Neat way to get a two-char 'sliding window'
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        prob = P[ix1, ix2]
        print(f'{ch1}{ch2}: {prob:.4f}')
log_likelihood = 0.0
n = 0  # bigram tuple count
# copied from above, but now modified - log likelihood over all words
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):  # Neat way to get a two-char 'sliding window'
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        prob = P[ix1, ix2]
        logprob = torch.log(prob)
        log_likelihood += logprob
        n += 1
        # print(f'{ch1}{ch2}: {prob:.4f} {logprob:.4f}')
print(f'{log_likelihood=}')  # As this is a tensor and we want to see that too
nll = -log_likelihood
print(f'{nll=}')  # Negative log likelihood
print(f'{nll/n}')  # Average negative log likelihood (this is the loss we want to minimize)
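A minimal illustration of why we work in log space (not part of the notebook; it assumes P and stoi from the cells above and uses 'emma' just as an example): the product of the bigram probabilities equals the exponential of their summed logs, but over the whole dataset the raw product would underflow towards zero while the log sum stays well-behaved.

likelihood = 1.0
logsum = 0.0
chs = ['.'] + list('emma') + ['.']
for ch1, ch2 in zip(chs, chs[1:]):
    prob = P[stoi[ch1], stoi[ch2]]
    likelihood *= prob.item()          # direct product of probabilities
    logsum += torch.log(prob).item()   # sum of log probabilities
print(likelihood, torch.exp(torch.tensor(logsum)).item())  # the two values agree (up to float error)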
log_likelihood = 0.0
n = 0
# Copied from above, but now modified
for w in ['andrejq']:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):  # Neat way to get a two-char 'sliding window'
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        prob = P[ix1, ix2]
        logprob = torch.log(prob)
        log_likelihood += logprob
        n += 1
        print(f'{ch1}{ch2}: {prob:.4f} {logprob:.4f}')
print(f'\n{log_likelihood=}')  # As this is a tensor and we want to see that too
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')
P = (N + 1).float()  # Adding more than 1 smooths the distributions out even more; see the NN approach for a discussion of this
# This is allowed in PyTorch:
P /= P.sum(1, keepdims=True)  # the sum is a 27x1 vector (dim 1 means a row-wise sum)
We add +1 to N so that we avoid the \(\infty\) caused by \(\log\) of zero counts.
After smoothing, rerunning the same code as before now assigns a (very small) probability to the bigram jq.
log_likelihood = 0.0
n = 0
# Copied from above, but now modified
for w in ['andrejq']:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):  # Neat way to get a two-char 'sliding window'
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        prob = P[ix1, ix2]
        logprob = torch.log(prob)
        log_likelihood += logprob
        n += 1
print(f'{log_likelihood=}')  # As this is a tensor and we want to see that too
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')
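As a direct spot check (not in the notebook; it assumes N, P and stoi from the cells above, and that the 'jq' count really is zero, as the infinite loss above suggests), compare the raw and the smoothed probability of the bigram jq:

P_raw = N.float()
P_raw /= P_raw.sum(1, keepdims=True)                # probabilities without smoothing
ix1, ix2 = stoi['j'], stoi['q']
print(P_raw[ix1, ix2], torch.log(P_raw[ix1, ix2]))  # expected: 0. and -inf if 'jq' never occurs
print(P[ix1, ix2], torch.log(P[ix1, ix2]))          # small but nonzero probability, finite log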
# Create training set of all bigrams
xs, ys = [], []  # Input and output character indices
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)
# Convert lists to tensors
xs = torch.tensor(xs)
ys = torch.tensor(ys)
# Create training set from one particular word's bigrams
xs, ys = [], []
for w in words[:1]:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        print(f'{ch1}{ch2}: {ix1} -> {ix2}')
        xs.append(ix1)
        ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
print(xs)
print(ys)
xenc = F.one_hot(xs, num_classes=27).float()  # num_classes removes the need for F's guessing
xenc.shape  # For '.emma' this will be [5, 27]
plt.imshow(xenc)  # '. e m m a' (remember, this is the input; the output would be 'e m m a .' for this example)
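A small check on the encoding (not in the notebook): every row of xenc contains exactly one 1, and the position of that 1 recovers the original index.

print(xenc.sum(1))         # all ones: one hot bit per row
print(xenc.argmax(1), xs)  # the argmax of each row gives back xs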
W = torch.randn((27, 1), generator=g)  # the neuron: a random column vector of 27 numbers from a normal distribution
a = xenc @ W  # '@' is PyTorch's matrix multiplication operator (5x27 @ 27x1 -> 5x1)
print(a)  # this is now a 5x1 vector
W = torch.randn((27, 27), generator=g)  # random 27x27 weight matrix (the previous one was 27x1 for a single neuron)
a = xenc @ W  # @ is PyTorch's matrix multiplication operator (5x27 @ 27x27 -> 5x27)
print(a)  # this is now a 5x27 matrix
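Worth noting (a check that is not in the notebook, assuming a, xenc, xs and W from the cell above): multiplying a one-hot row with W just selects the corresponding row of W, so xenc @ W is equivalent to plain row indexing.

print(torch.allclose(a, W[xs]))  # True: one-hot matmul == picking out rows of W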
logits = xenc @ W  # logits, a different word for log-counts
# These two steps combined are called softmax -> build a probability distribution from the logits
counts = logits.exp()  # exp maps negative numbers into (0, 1) and positive numbers above 1, so everything becomes positive
# Let's just say the counts variable holds something like 'fake counts', kinda like in the N matrix of the bigram model; we process them just the same
probs = counts / counts.sum(1, keepdims=True)  # normalized probability distributions
print(probs.shape)  # 5x27, as expected
print(probs[0].sum())  # Will be 1. for any index [0-4]
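The exp-then-normalize combination above is exactly what F.softmax computes; a quick equivalence check (not in the notebook):

print(torch.allclose(probs, F.softmax(logits, dim=1)))  # True: exp + row-normalize == softmax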
# FORWARD PASS:
xenc = F.one_hot(xs, num_classes=27).float()  # one-hot encode the inputs
logits = xenc @ W  # logits, a different word for log-counts
# Softmax as part of the forward pass
counts = logits.exp()  # 'fake counts', kinda like in the N matrix of the bigram model
probs = counts / counts.sum(1, keepdims=True)  # normalized probability distributions
print(probs.shape)
nlls = torch.zeros(len(xs))  # 5
# Five bigrams making up '.emma.'
for i in range(len(xs)):
    # i-th bigram
    x = xs[i].item()  # input character index
    y = ys[i].item()  # output character index
    print("\n-------\n")
    print(f'bigram example tuple {i+1}: ("{itos[x]}", "{itos[y]}") (indexes ({x}, {y}))')  # Input is index x, expected output is index y
    print('\t>> input to the neural net:', x, f'({itos[x]})')  # Again, x, the index, is the NN's input
    print('\t>> output probabilities from the neural net:', probs[i])  # We built probs in the cell above
    print('\t>> most likely next character:', itos[probs[i].argmax().item()], f'(index {probs[i].argmax().item()}, likelihood {probs[i].max().item()})')  # argmax() returns the index of the highest value in probs[i]
    print('\t>> label (actual next character):', y)
    p = probs[i, y]
    print('\t>> probability assigned by the net to the correct character:', p.item())
    logp = torch.log(p)
    print('\t>> log likelihood:', logp.item())
    nll = -logp
    print('\t>> negative log likelihood:', nll.item())
    nlls[i] = nll
print('\n============\n')
print('average negative log likelihood, i.e. loss =', nlls.mean().item())
-------
bigram example tuple 1: (".", "e") (indexes (0, 5))
>> input to the neural net: 0 (.)
>> output probabilities from the neural net: tensor([0.0360, 0.0688, 0.0065, 0.0839, 0.0135, 0.0188, 0.0079, 0.0926, 0.0134,
0.0093, 0.0383, 0.0298, 0.0210, 0.0887, 0.0142, 0.0530, 0.0123, 0.0526,
0.0787, 0.0087, 0.0462, 0.0592, 0.0055, 0.0236, 0.0135, 0.0466, 0.0573])
>> most likely next character: g (index 7, likelihood 0.09264726936817169)
>> label (actual next character): 5
>> probability assigned by the net to the correct character: 0.018827954307198524
>> log likelihood: -3.972412586212158
>> negative log likelihood: 3.972412586212158
-------
bigram example tuple 2: ("e", "m") (indexes (5, 13))
>> input to the neural net: 5 (e)
>> output probabilities from the neural net: tensor([0.0582, 0.0233, 0.0303, 0.0744, 0.0082, 0.0142, 0.0065, 0.0682, 0.0113,
0.0420, 0.0122, 0.0819, 0.0327, 0.0105, 0.0561, 0.0050, 0.0179, 0.0087,
0.1070, 0.0610, 0.0199, 0.0405, 0.0109, 0.1517, 0.0185, 0.0202, 0.0088])
>> most likely next character: w (index 23, likelihood 0.15169577300548553)
>> label (actual next character): 13
>> probability assigned by the net to the correct character: 0.010476764291524887
>> log likelihood: -4.558595657348633
...
============
average negative log likelihood, i.e. loss = 4.237281322479248
# Probabilities the net assigns to each (input, correct output) pair
print('input ".", output "e":', probs[0, 5])  # input: '.', probability shown for 'e' being drawn
print('input "e", output "m":', probs[1, 13])
print('input "m", output "m":', probs[2, 13])
print('input "m", output "a":', probs[3, 1])
print('input "a", output ".":', probs[4, 0])
# Along dimension 0 of probs, we pluck out of each row (a row is 27 wide) the entry at the index stated in ys at the same position
probs[torch.arange(len(probs)), ys]  # The probabilities the NN assigns to the correct next character
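For clarity, the same fancy indexing written as an explicit loop (a check that is not part of the notebook):

picked = torch.stack([probs[i, ys[i]] for i in range(len(ys))])
print(torch.allclose(picked, probs[torch.arange(len(probs)), ys]))  # True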
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), device=torch.device("cpu"), generator=g, requires_grad=True)  # random 27x27 weight matrix (requires_grad=True for autograd)
# FORWARD PASS:
xenc = F.one_hot(xs, num_classes=27).float()  # one-hot encode the inputs
logits = xenc @ W  # logits, a different word for log-counts
# Softmax:
counts = logits.exp()  # 'fake counts', kinda like in the N matrix of the bigram model
probs = counts / counts.sum(1, keepdims=True)  # normalized probability distributions (this is y_pred)
loss = -probs[torch.arange(len(probs)), ys].log().mean()
print('Loss:', loss.item())
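For reference (not part of the notebook; it assumes logits and ys from the cell above): PyTorch's F.cross_entropy fuses the softmax, the pick-out-the-correct-probability step, and the negative-log mean into one call, and should reproduce the same loss.

print(F.cross_entropy(logits, ys).item())  # should match loss.item() above (up to float error)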
W.grad = None  # Make sure all gradients are reset to zero
loss.backward()  # Torch kept track of how loss was computed from W, kinda cool
# Looking at the backward pass' impact
W.grad  # There's now stuff inside here
print(W.shape)  # 27x27 gradients, one per weight
print(W.grad[0, 0])  # nudging this weight up by a tiny amount changes the loss by roughly this gradient times that amount
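A rough finite-difference check of that gradient (not part of the notebook; it assumes xs, ys, W and loss from the cells above, perturbs a single weight, and undoes the change afterwards):

eps = 1e-3
with torch.no_grad():
    W[0, 0] += eps  # nudge one weight
xenc = F.one_hot(xs, num_classes=27).float()
logits = xenc @ W
counts = logits.exp()
probs = counts / counts.sum(1, keepdims=True)
loss2 = -probs[torch.arange(len(probs)), ys].log().mean()
print((loss2 - loss).item() / eps, W.grad[0, 0].item())  # the two numbers should roughly agree
with torch.no_grad():
    W[0, 0] -= eps  # undo the nudge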
# Create the training set of all bigrams
xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples:', num)
# Initialize the network
g = torch.Generator(device=device).manual_seed(2147483647)
W = torch.randn((27, 27), device=device, generator=g, requires_grad=True)  # random 27x27 weight matrix (requires_grad=True for autograd)
# Training cycles, using the entire dataset -> 200 epochs
for k in range(200):
    # Forward pass
    xenc = F.one_hot(xs, num_classes=27).float().to(device)  # one-hot encode the inputs
    logits = xenc @ W  # logits, a different word for log-counts
    counts = logits.exp()  # 'fake counts', kinda like in the N matrix of the bigram model
    probs = counts / counts.sum(1, keepdims=True)  # normalized probability distributions (this is y_pred)
    loss = -probs[torch.arange(len(probs)), ys].log().mean()
    print(f'Loss @ iteration {k+1}: {loss}')
    # Backward pass
    W.grad = None  # Make sure all gradients are reset
    loss.backward()  # Torch kept track of how loss was computed, kinda cool
    # Weight update
    W.data += -50 * W.grad
Loss @ iteration 1: 3.7109203338623047
Loss @ iteration 2: 3.3591017723083496
Loss @ iteration 3: 3.136110305786133
Loss @ iteration 4: 2.996102809906006
Loss @ iteration 5: 2.901326894760132
Loss @ iteration 6: 2.832087278366089
Loss @ iteration 7: 2.780813217163086
Loss @ iteration 8: 2.7420215606689453
Loss @ iteration 9: 2.7117466926574707
Loss @ iteration 10: 2.6873340606689453
Loss @ iteration 11: 2.6671009063720703
Loss @ iteration 12: 2.6499812602996826
Loss @ iteration 13: 2.635272264480591
Loss @ iteration 14: 2.622483253479004
Loss @ iteration 15: 2.6112561225891113
Loss @ iteration 16: 2.6013193130493164
Loss @ iteration 17: 2.5924623012542725
Loss @ iteration 18: 2.584519386291504
Loss @ iteration 19: 2.5773584842681885
Loss @ iteration 20: 2.5708725452423096
Loss @ iteration 21: 2.564974546432495
Loss @ iteration 22: 2.5595920085906982
Loss @ iteration 23: 2.554664134979248
Loss @ iteration 24: 2.550138235092163
Loss @ iteration 25: 2.5459702014923096
...
Loss @ iteration 197: 2.462291717529297
Loss @ iteration 198: 2.4622433185577393
Loss @ iteration 199: 2.46219539642334
Loss @ iteration 200: 2.4621477127075195
# Training cycles, using the entire dataset -> 200 epochs
for k in range(200):
    # Forward pass
    xenc = F.one_hot(xs, num_classes=27).float().to(device)  # one-hot encode the inputs
    logits = xenc @ W  # logits, a different word for log-counts
    counts = logits.exp()  # 'fake counts', kinda like in the N matrix of the bigram model
    probs = counts / counts.sum(1, keepdims=True)  # normalized probability distributions (this is y_pred)
    loss = -probs[torch.arange(len(probs)), ys].log().mean() + 0.01 * (W**2).mean()  # now with a regularization term
    print(f'Loss @ iteration {k+1}: {loss}')
    # Backward pass
    W.grad = None  # Make sure all gradients are reset
    loss.backward()  # Torch kept track of how loss was computed, kinda cool
    # Weight update
    W.data += -50 * W.grad
Loss @ iteration 1: 2.4835662841796875
Loss @ iteration 2: 2.4835257530212402
Loss @ iteration 3: 2.483488082885742
Loss @ iteration 4: 2.483452796936035
Loss @ iteration 5: 2.483419179916382
Loss @ iteration 6: 2.483386516571045
Loss @ iteration 7: 2.4833555221557617
Loss @ iteration 8: 2.4833250045776367
Loss @ iteration 9: 2.4832959175109863
Loss @ iteration 10: 2.4832677841186523
Loss @ iteration 11: 2.4832401275634766
Loss @ iteration 12: 2.4832139015197754
Loss @ iteration 13: 2.483187437057495
Loss @ iteration 14: 2.4831619262695312
Loss @ iteration 15: 2.483137607574463
Loss @ iteration 16: 2.4831132888793945
Loss @ iteration 17: 2.4830894470214844
Loss @ iteration 18: 2.4830663204193115
Loss @ iteration 19: 2.483043909072876
Loss @ iteration 20: 2.4830214977264404
Loss @ iteration 21: 2.482999801635742
Loss @ iteration 22: 2.482978343963623
Loss @ iteration 23: 2.482957601547241
Loss @ iteration 24: 2.4829370975494385
Loss @ iteration 25: 2.482917070388794
...
Loss @ iteration 197: 2.481318712234497
Loss @ iteration 198: 2.481313943862915
Loss @ iteration 199: 2.481309652328491
Loss @ iteration 200: 2.4813051223754883
# Finally, sample from this neural network model
# (This structure is copied from the bigram approach)
g = torch.Generator(device=device).manual_seed(2147483642)
for i in range(5):
    out = []
    ix = 0
    while True:
        # ----------
        # BEFORE:
        # p = P[ix]  # Bigram explicit probability approach
        # ----------
        # NOW:
        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float().to(device)
        logits = xenc @ W  # predict log-counts
        counts = logits.exp()  # counts, equivalent to N
        p = counts / counts.sum(1, keepdims=True)  # probabilities for the next character
        # ----------
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 0:
            break
    print(''.join(out))