# !pip install IPython 
from IPython.display import Image 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action='ignore')


def preprocess(text):
    """Tokenize *text* and build the word <-> ID lookup tables.

    The text is lower-cased, every '.' is split off as its own token, and the
    result is split on whitespace. IDs are handed out in order of first
    appearance, starting at 0.

    Args:
        text: The raw input string.

    Returns:
        A tuple ``(corpus, word_to_id, id_to_word)`` where ``corpus`` is a
        NumPy array of word IDs in text order and the two dicts map
        word -> ID and ID -> word respectively.
    """
    # Insert a space before each '.' so the period becomes a separate token.
    tokens = text.lower().replace('.', ' .').split()

    # dict.fromkeys de-duplicates while keeping first-seen order, so
    # enumerating it yields the same IDs as assigning len(word_to_id) on
    # each first occurrence.
    word_to_id = {
        token: new_id
        for new_id, token in enumerate(dict.fromkeys(tokens))
    }
    id_to_word = {new_id: token for token, new_id in word_to_id.items()}

    corpus = np.array([word_to_id[token] for token in tokens])
    return corpus, word_to_id, id_to_word


# Run the preprocessing on a sample sentence; the trailing bare expression
# makes the notebook display the corpus array and both lookup dicts.
text = 'You say goodbye and I say Hello.'
corpus, word_to_id, id_to_word = preprocess(text)
corpus, word_to_id, id_to_word

(array([0, 1, 2, 3, 4, 1, 5, 6]),
 {'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6},
 {0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'})


def create_to_matrix(corpus, vocab_size, window_size=1):
    """Build a co-occurrence matrix from a corpus of word IDs.

    For every position in the corpus, each word found within ``window_size``
    positions to the left or right is counted once as context.

    Args:
        corpus: Sequence of integer word IDs.
        vocab_size: Number of rows/columns of the resulting square matrix.
        window_size: How many neighbours on each side count as context.

    Returns:
        An int32 array of shape ``(vocab_size, vocab_size)`` where entry
        ``[a, b]`` counts how often ``b`` appeared in the window around ``a``.
    """
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for offset in range(1, window_size + 1):
        # Pair every token with the token `offset` positions to its right.
        # Each such pair is the "right" neighbour of the first token and the
        # "left" neighbour of the second, so both directions are counted.
        for word_id, neighbour_id in zip(corpus, corpus[offset:]):
            co_matrix[word_id, neighbour_id] += 1
            co_matrix[neighbour_id, word_id] += 1

    return co_matrix


# The co-occurrence matrix needs one row/column per distinct word, so it is
# sized by the vocabulary (len(word_to_id)) rather than by the corpus length.
# Passing len(corpus) over-allocates all-zero rows/columns whenever a word
# repeats, and those empty rows later cause 0/0 divisions in the PPMI step.
C = create_to_matrix(corpus, len(word_to_id))
C

array([[0, 1, 0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 1, 0, 0],
       [0, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0]])


# Show the co-occurrence row (context counts) for the word 'goodbye'.
C[word_to_id['goodbye']]

array([0, 1, 0, 1, 0, 0, 0, 0])


def cos_similarity(x, y, eps=1e-8):
    """Return the cosine similarity between vectors *x* and *y*.

    Each vector is scaled by its L2 norm before taking the dot product.

    Args:
        x: First vector (NumPy array).
        y: Second vector (NumPy array).
        eps: Small constant added to each norm so that an all-zero vector
            does not cause a division by zero.

    Returns:
        The dot product of the two normalized vectors.
    """
    x_norm = np.sqrt(np.sum(x * x))
    y_norm = np.sqrt(np.sum(y * y))

    unit_x = x / (x_norm + eps)
    unit_y = y / (y_norm + eps)
    return np.dot(unit_x, unit_y)


# Compare the co-occurrence rows of 'you' and 'i' by cosine similarity.
c0 = C[word_to_id['you']]
c1 = C[word_to_id['i']]
cos_similarity(c0, c1)

0.7071067691154799


# argsort returns the indices that would sort the array, not the sorted values.
x= np.array([100, -20, 2])
x.argsort() # indices that order the elements in ascending order

array([1, 2, 0], dtype=int64)


(-x).argsort() # negating the values first yields indices in descending order

array([0, 2, 1], dtype=int64)


def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    """Print the words most similar to *query*, ranked by cosine similarity.

    Args:
        query: The word to look up.
        word_to_id: Dict mapping word -> ID.
        id_to_word: Dict mapping ID -> word; its length is used as the
            vocabulary size.
        word_matrix: Matrix whose row ``i`` is the vector of word ID ``i``.
        top: Stop after this many results have been printed.

    Returns:
        None. Results (or a not-found message) are written with ``print``.
    """
    # Bail out early when the query word is not in the vocabulary.
    if query not in word_to_id:
        print('%s를 찾을 수 없습니다.'%query)
        return

    print('\n[query]' +query)
    query_vec = word_matrix[word_to_id[query]]

    # Cosine similarity of every vocabulary word against the query vector.
    # The buffer is float64 regardless of the dtype of word_matrix.
    vocab_size = len(id_to_word)
    similarity = np.array(
        [cos_similarity(word_matrix[i], query_vec) for i in range(vocab_size)],
        dtype=np.float64,
    )

    # Walk the words from most to least similar, skipping the query itself.
    shown = 0
    for word_id in np.argsort(-1 * similarity):
        word = id_to_word[word_id]
        if word == query:
            continue
        print('%s:%s'%(word, similarity[word_id]))

        shown += 1
        if shown >= top:
            return


# Print the words whose co-occurrence rows are closest to that of 'you'.
most_similar('you', word_to_id, id_to_word, C)

[query]you
goodbye:0.7071067691154799
i:0.7071067691154799
hello:0.7071067691154799
say:0.0
and:0.0


def ppmi(C, verbose=False, eps=1e-8):
    """Convert a co-occurrence matrix into a PPMI matrix.

    Each entry is ``max(0, log2(C[i, j] * N / (S[j] * S[i]) + eps))`` where
    ``N`` is the sum of all counts and ``S`` holds the column sums of ``C``.
    ``S`` is indexed by both ``i`` and ``j``, so ``C`` is treated as a square,
    symmetric matrix.

    Entries whose marginal product ``S[j] * S[i]`` is zero (a word that never
    occurs) are left at 0 instead of being divided by zero.

    Args:
        C: 2-D co-occurrence count matrix.
        verbose: If True, print progress while iterating over the entries.
        eps: Small constant added inside the logarithm so that a zero
            co-occurrence count does not evaluate ``log2(0)``.

    Returns:
        A float32 array with the same shape as ``C``.
    """
    M = np.zeros_like(C, dtype=np.float32)
    # Work in float64: the products below would otherwise be computed in the
    # integer dtype of C and could overflow for large counts.
    N = float(np.sum(C))
    S = np.sum(C, axis=0).astype(np.float64)
    n_rows, n_cols = C.shape
    total = n_rows * n_cols
    report_every = total // 100 + 1  # roughly one progress line per percent
    cnt = 0

    for i in range(n_rows):
        for j in range(n_cols):
            denom = S[j] * S[i]
            # A zero marginal means the word never occurs; PMI is undefined
            # there, so keep the PPMI entry at 0 rather than dividing by zero.
            if denom != 0:
                pmi = np.log2(C[i, j] * N / denom + eps)
                M[i, j] = max(0, pmi)

            if verbose:
                cnt += 1
                if cnt % report_every == 0:
                    print('%.f%%완료'%(100*(cnt/total)))
    return M


# Convert the co-occurrence matrix to PPMI and print both for comparison.
W = ppmi(C)
np.set_printoptions(precision=3) # print array values with 3 digits of precision
print('동시발생행렬')
print(C)
print('-'*50)
print('PPMI')
print(W)

동시발생행렬
[[0 1 0 0 0 0 0 0]
 [1 0 1 0 1 1 0 0]
 [0 1 0 1 0 0 0 0]
 [0 0 1 0 1 0 0 0]
 [0 1 0 1 0 0 0 0]
 [0 1 0 0 0 0 1 0]
 [0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0]]
--------------------------------------------------
PPMI
[[0.    1.807 0.    0.    0.    0.    0.    0.   ]
 [1.807 0.    0.807 0.    0.807 0.807 0.    0.   ]
 [0.    0.807 0.    1.807 0.    0.    0.    0.   ]
 [0.    0.    1.807 0.    1.807 0.    0.    0.   ]
 [0.    0.807 0.    1.807 0.    0.    0.    0.   ]
 [0.    0.807 0.    0.    0.    0.    2.807 0.   ]
 [0.    0.    0.    0.    0.    2.807 0.    0.   ]
 [0.    0.    0.    0.    0.    0.    0.    0.   ]]


# Dimensionality reduction: factorize the PPMI matrix with SVD.
# NOTE(review): np.linalg.svd returns the third factor already transposed
# (Vh), so the name V is slightly misleading; it is not used below.
U, S, V = np.linalg.svd(W)
print(C[0]) # row 0 of the co-occurrence matrix
print(W[0]) # row 0 of the PPMI matrix
print(U[0]) # row 0 of U from the SVD
print(U[0,:2]) # first two components of that row, i.e. a 2-D vector

[0 1 0 0 0 0 0 0]
[0.    1.807 0.    0.    0.    0.    0.    0.   ]
[ 3.409e-01 -1.110e-16 -3.886e-16 -1.205e-01 -9.323e-01  0.000e+00
  0.000e+00  1.958e-17]
[ 3.409e-01 -1.110e-16]


# Plot every word in 2-D using the first two columns of U as coordinates.
for word, word_id in word_to_id.items():
    # Label the point for this word with its text.
    plt.annotate(word, (U[word_id,0], U[word_id, 1]))
plt.scatter(U[:,0], U[:,1], alpha=0.5)
plt.show()

[Deep Learning from Scratch 2] chapter 4. word2vec 속도개선 (0)	2023.04.26
[Deep Learning from Scratch 2] chapter 3. word2vec (0)	2023.04.26
[Deep Learning from Scratch] chapter 8-4. 딥러닝 NIC, DCGAN (0)	2023.04.25
[Deep Learning from Scratch] chapter 8-3. 딥러닝 FCN (0)	2023.04.25
[Deep Learning from Scratch] chapter 8-2. 딥러닝 R-CNN (0)	2023.04.25

Sunny Finance & Tech Blog

[Deep Learning from Scratch 2] chapter 2. 자연어와 단어의 분산 표현

자연어와 단어의 분산표현¶

Thesaurus¶

통계 기반 기법¶

말뭉치 전처리( corpus preprocessing)¶

단어의 분산 표현, 분포가설¶

벡터간 유사도¶

유사 단어 랭킹 표시¶

통계기반 기법 개선¶

상호 정보량¶

차원감소( Dimension Reduction)¶

'Data Science > Deep Learning' 카테고리의 다른 글

+ Recent posts

티스토리툴바