2017년, Google의 연구원 8명은 "Attention Is All You Need"라는 논문을 발표했습니다. 그 제목은 도발적이었습니다. 당시 지배적인 견해는 시퀀스(sequence)를 처리하기 위해서는 순환 신경망 (recurrent networks)이 필요하다는 것이었습니다. 메모리와 순차적 처리 (sequential processing)가 필요하다고 믿었습니다. 하지만 이 논문은 그 중 어느 것도 필요하지 않다고 말했습니다. 그저 스마트한 아키텍처에 적용된 어텐션 (attention)만 있으면 된다는 것이었습니다. 그 결과는 더 빠른 학습 속도, 더 쉬운 병렬화 (parallelize), 그리고 거의 모든 벤치마크에서의 더 나은 성능이었습니다. 그들이 논문에서 설명한 Transformer는 BERT, GPT-2, GPT-3, GPT-4, Claude, Gemini, 그리고 우리가 컴퓨터와 상호작용하는 방식을 바꾼 다른 모든 거대 언어 모델 (large language model)의 직계 조상입니다. 이 포스트에서는 전체 Transformer 인코더 (encoder)를 처음부터 구축합니다. 지난 포스트에서 이미 모든 조각을 확보했습니다. 이제 그것들이 하나로 합쳐지는 단계입니다.

전체 아키텍처 (The Full Architecture)
Transformer는 두 가지 주요 구성 요소를 가집니다:

인코더 (Encoder): 입력 시퀀스를 읽고 풍부한 문맥적 표현 (contextual representation)을 구축합니다. BERT는 인코더입니다. 분류 (classification), 개체명 인식 (named entity recognition), 질의응답 (question answering)과 같은 이해 작업에 사용됩니다.

디코더 (Decoder): 출력 토큰 (output tokens)을 하나씩 생성합니다. GPT는 디코더입니다. 텍스트 완성 (text completion), 번역 (translation), 요약 (summarization)과 같은 생성 작업에 사용됩니다.

원래의 "Attention Is All You Need" Transformer는 번역을 위해 두 가지를 모두 사용했습니다. 현대의 LLM은 종종 둘 중 하나만을 사용합니다.

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import math
import warnings

# Suppress every Python warning for the remainder of the script.
warnings.filterwarnings("ignore")
# Seed PyTorch's global random number generator with a fixed value.
torch.manual_seed(42)

def attention(Q, K, V, mask=None):
    """Compute scaled dot-product attention.

    Scores are ``Q @ K^T`` over the last two dimensions, divided by the
    square root of the size of ``Q``'s last dimension. When ``mask`` is
    given, every score where ``mask == 0`` is set to ``-inf`` before the
    softmax over the last dimension.

    Args:
        Q: Query tensor.
        K: Key tensor; its last two dimensions are transposed for the product.
        V: Value tensor, combined using the attention weights.
        mask: Optional tensor compared against 0; it must broadcast to the
            shape of the score matrix.

    Returns:
        A tuple ``(output, weights)`` where ``output`` is ``weights @ V`` and
        ``weights`` is the softmax of the (optionally masked) scores.
    """
    scale = math.sqrt(Q.size(-1))
    logits = (Q @ K.transpose(-2, -1)) / scale
    if mask is not None:
        blocked = mask == 0
        logits = logits.masked_fill(blocked, float("-inf"))
    probs = logits.softmax(dim=-1)
    return probs @ V, probs

class MultiHeadAttention(nn.Module):
    """Multi-head attention built on the module-level ``attention`` function.

    Queries, keys and values are each projected with a bias-free linear
    layer, split into ``n_heads`` heads of size ``d_k = d_model // n_heads``,
    attended per head, concatenated again and passed through a final
    bias-free output projection.

    Args:
        d_model: Size of the input and output feature dimension. Must be
            divisible by ``n_heads``.
        n_heads: Number of attention heads.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)
        self.W_o = nn.Linear(d_model, d_model, bias=False)

    def split_heads(self, x):
        """Reshape ``(batch, seq, d_model)`` into ``(batch, n_heads, seq, d_k)``."""
        b, s, _ = x.shape
        return x.reshape(b, s, self.n_heads, self.d_k).transpose(1, 2)

    def forward(self, q, k, v, mask=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q: Query input, a 3-D tensor whose last dimension is ``d_model``.
            k: Key input, same layout as ``q``.
            v: Value input, same layout as ``q``.
            mask: Optional mask forwarded unchanged to ``attention``; it must
                broadcast against the per-head score tensor.

        Returns:
            A tuple ``(output, weights)``: the projected attention output
            with last dimension ``n_heads * d_k`` and the per-head attention
            weights returned by ``attention``.
        """
        Q = self.split_heads(self.W_q(q))
        K = self.split_heads(self.W_k(k))
        V = self.split_heads(self.W_v(v))
        out, w = attention(Q, K, V, mask)
        # Undo split_heads: move heads next to d_k and merge them back together.
        out = out.transpose(1, 2).reshape(q.shape[0], -1, self.n_heads * self.d_k)
        return self.W_o(out), w

class FeedForward(nn.Module):
    """Two-layer feed-forward network.

    The stack is ``Linear(d_model, d_ff) -> GELU -> Dropout ->
    Linear(d_ff, d_model) -> Dropout``.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout: Dropout probability used after the activation and after the
            second linear layer.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        """Apply the feed-forward stack to ``x`` and return the result."""
        return self.net(x)

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    A ``(1, max_len, d_model)`` table is precomputed and registered as the
    buffer ``pe``: even feature columns hold ``sin(pos / div)`` and odd
    columns hold ``cos(pos / div)``, with
    ``div = 10000 ** (arange(0, d_model, 2) / d_model)``.

    Args:
        d_model: Feature size of the encodings.
        max_len: Number of positions to precompute.
        dropout: Dropout probability applied after adding the encodings.
    """

    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1).float()
        div = torch.pow(10000.0, torch.arange(0, d_model, 2).float() / d_model)
        pe[:, 0::2] = torch.sin(pos / div)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the divisor to the number of odd columns.
        pe[:, 1::2] = torch.cos(pos / div[: d_model // 2])
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Return ``dropout(x + pe[:, :x.size(1)])``.

        ``x`` is indexed on dimension 1 for its length, so that length must
        not exceed ``max_len``.
        """
        return self.dropout(x + self.pe[:, :x.size(1)])

# Progress message emitted once the building-block definitions above have run.
print("Building blocks assembled. Now constructing the full transformer.")

인코더 레이어 (The Encoder Layer)

class EncoderLayer(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward network.

    Each sublayer's output goes through dropout, is added back to its input,
    and the sum is layer-normalised.

    Args:
        d_model: Feature size of the layer's input and output.
        n_heads: Number of attention heads.
        d_ff: Hidden size of the feed-forward network.
        dropout: Dropout probability used in the feed-forward network and on
            each sublayer output.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run self-attention and the feed-forward network over ``x``.

        Args:
            x: Input tensor, used as query, key and value.
            mask: Optional attention mask passed to the self-attention module.

        Returns:
            The transformed tensor; the attention weights are discarded.
        """
        attn_out, _ = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_out))
        ffn_out = self.ffn(x)
        x = self.norm2(x + self.dropout(ffn_out))
        return x

class TransformerEncoder(nn.Module):
    """Stack of encoder layers on top of token embeddings and position encodings.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding and hidden feature size.
        n_heads: Number of attention heads per layer.
        d_ff: Hidden size of each layer's feed-forward network.
        n_layers: Number of ``EncoderLayer`` instances in the stack.
        max_len: Number of positions precomputed by the positional encoding.
        dropout: Dropout probability passed to the positional encoding and
            to every layer.
    """

    def __init__(self, vocab_size, d_model, n_heads, d_ff, n_layers, max_len=512, dropout=0.1):
        super().__init__()
        # Index 0 is registered as the embedding's padding index.
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pos_enc = PositionalEncoding(d_model, max_len, dropout)
        self.layers = nn.ModuleList(
            [EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)]
        )
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Encode a batch of token ids.

        Args:
            x: Integer tensor of token ids fed to the embedding table.
            mask: Optional attention mask handed to every layer.

        Returns:
            The output of the last layer after the final layer normalisation.
        """
        # Embeddings are scaled by sqrt(d_model) before positions are added.
        x = self.pos_enc(self.embedding(x) * math.sqrt(self.embedding.embedding_dim))
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

# Hyperparameters for the demonstration encoder.
VOCAB_SIZE = 10000
D_MODEL = 256
N_HEADS = 8
D_FF = 1024
N_LAYERS = 6
MAX_LEN = 512

encoder = TransformerEncoder(VOCAB_SIZE, D_MODEL, N_HEADS, D_FF, N_LAYERS, MAX_LEN)
total_params = sum(p.numel() for p in encoder.parameters())

print("Transformer Encoder configuration:")
print(f"vocab_size: {VOCAB_SIZE:,}")
print(f"d_model: {D_MODEL}")
print(f"n_heads: {N_HEADS} (d_k = {D_MODEL // N_HEADS} per head)")
print(f"d_ff: {D_FF}")
print(f"n_layers: {N_LAYERS}")
print(f"max_len: {MAX_LEN}")
print(f"\nTotal parameters: {total_params:,}")
print()

# Token ids start at 1 because index 0 is the embedding's padding index.
x_ids = torch.randint(1, VOCAB_SIZE, (2, 20))
out = encoder(x_ids)
print(f"Input shape: {x_ids.shape} (batch=2, seq_len=20)")
# The source carried a stray, unparseable fragment of this message on its own
# line; it is merged into the output-shape print here.
print(f"Output shape: {out.shape} (batch=2, seq_len=20, d_model={D_MODEL})")
print()
print("이제 모든 토큰은 문맥을 인식하는 256차원 표현(representation)을 갖게 됩니다.")
print("'river bank'의 'bank'와 'bank account'의 'bank'는 서로 다른 벡터를 얻습니다.")

class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, cross-attention, then feed-forward.

    Each sublayer's output goes through dropout, is added back to its input,
    and the sum is layer-normalised.

    Args:
        d_model: Feature size of the layer's input and output.
        n_heads: Number of attention heads in both attention modules.
        d_ff: Hidden size of the feed-forward network.
        dropout: Dropout probability used in the feed-forward network and on
            each sublayer output.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads)
        self.cross_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask=None, tgt_mask=None):
        """Run the three sublayers over ``x``.

        Args:
            x: Decoder-side input; query, key and value of the self-attention.
            enc_out: Encoder output; key and value of the cross-attention.
            src_mask: Optional mask passed to the cross-attention.
            tgt_mask: Optional mask passed to the self-attention.

        Returns:
            The transformed tensor; attention weights are discarded.
        """
        self_out, _ = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_out))

        # Queries come from the decoder state, keys and values from the encoder.
        cross_out, _ = self.cross_attn(x, enc_out, enc_out, src_mask)
        x = self.norm2(x + self.dropout(cross_out))

        ffn_out = self.ffn(x)
        x = self.norm3(x + self.dropout(ffn_out))
        return x

# Print the summary of the three decoder sublayers in a single call; the empty
# entries produce the blank separator lines.
print(
    "\n".join(
        [
            "Decoder Layer는 세 개의 서브레이어(sublayers)를 가집니다:",
            "",
            "1. MASKED SELF-ATTENTION (마스크된 셀프 어텐션)",
            "인과적 마스크(Causal mask): 각 토큰은 이전 토큰들에만 어텐션(attention)을 수행합니다.",
            "학습 중 미래의 토큰을 미리 보고 '컨닝'하는 것을 방지합니다.",
            "",
            "2. CROSS-ATTENTION (크로스 어텐션)",
            "Query는 디코더(decoder)에서, Key+Value는 인코더(encoder) 출력에서 가져옵니다.",
            "디코더가 인코딩된 전체 입력에 어텐션을 수행할 수 있게 합니다.",
            "디코더가 번역하거나 요약하려는 내용을 어떻게 '읽는지'를 결정합니다.",
            "",
            "3. FEED-FORWARD (피드 포워드)",
            "인코더와 동일합니다. 각 위치를 독립적으로 처리합니다.",
        ]
    )
)

BERT와 GPT 아키텍처 비교
# Reference table of well-known Transformer variants. "layers" is an int,
# except for the encoder-decoder model where it is a string such as "12+12".
architectures = {
    "BERT-base": {
        "type": "Encoder only (인코더 전용)",
        "layers": 12,
        "d_model": 768,
        "n_heads": 12,
        "params": "110M",
        "training": "Masked Language Model (MLM) + Next Sentence Prediction (NSP)",
        "use_case": "Classification (분류), NER (개체명 인식), Q&A (질의응답), Embeddings (임베딩)",
        "attention": "Bidirectional (양방향, 전체 문맥 파악)",
    },
    "BERT-large": {
        "type": "Encoder only (인코더 전용)",
        "layers": 24,
        "d_model": 1024,
        "n_heads": 16,
        "params": "340M",
        "training": "Masked Language Model (MLM) + Next Sentence Prediction (NSP)",
        "use_case": "base와 동일하지만 더 높은 정확도",
        "attention": "Bidirectional (양방향)",
    },
    "GPT-2": {
        "type": "Decoder only (디코더 전용)",
        "layers": 12,
        "d_model": 768,
        "n_heads": 12,
        "params": "117M",
        "training": "Next Token Prediction (다음 토큰 예측, 자기회귀적)",
        "use_case": "Text generation (텍스트 생성), completion (문장 완성)",
        "attention": "Causal (인과적, 왼쪽에서 오른쪽으로만)",
    },
    "GPT-3": {
        "type": "Decoder only (디코더 전용)",
        "layers": 96,
        "d_model": 12288,
        "n_heads": 96,
        "params": "175B",
        "training": "대규모 스케일의 Next Token Prediction (다음 토큰 예측)",
        "use_case": "Few-shot learning (퓨샷 학습), generation (생성)",
        "attention": "Causal (인과적)",
    },
    "T5-base": {
        "type": "Encoder-Decoder (인코더-디코더)",
        "layers": "12+12",
        "d_model": 768,
        "n_heads": 12,
        "params": "220M",
        "training": "Text-to-Text format (텍스트-투-텍스트 형식), span masking (스팬 마스킹)",
        "use_case": "Translation (번역), summarization (요약), Q&A (질의응답)",
        "attention": "Encoder=bidirectional (인코더=양방향), Decoder=causal (디코더=인과적)",
    },
}

# Header row, rule, then one row per architecture.
print(f"{'Model':<14} {'Type':<20} {'Layers':>8} {'d_model':>8} {'Params':>8} {'Attention'}")
print("=" * 80)
for name, config in architectures.items():
    # "layers" may be an int or a string, so both numeric columns go through
    # str() before alignment; the attention text is cut to 25 characters.
    print(
        f"{name:<14} {config['type']:<20} {str(config['layers']):>8} "
        f"{str(config['d_model']):>8} {config['params']:>8} "
        f"{config['attention'][:25]}"
    )

Encoder 클래스를 사용하여 텍스트 분류기(Text Classifier) 구축하기

class TransformerClassifier(nn.Module):
    """Sequence classifier: a ``TransformerEncoder`` followed by an MLP head.

    The head is ``Linear(d_model, d_model // 2) -> GELU -> Dropout ->
    Linear(d_model // 2, num_classes)`` and is applied to the encoder output
    at sequence position 0.

    Args:
        vocab_size: Vocabulary size passed to the encoder.
        d_model: Encoder feature size.
        n_heads: Number of attention heads per encoder layer.
        d_ff: Hidden size of each encoder feed-forward network.
        n_layers: Number of encoder layers.
        num_classes: Number of output logits.
        max_len: Maximum sequence length passed to the encoder.
        dropout: Dropout probability for the encoder and the head.
    """

    def __init__(self, vocab_size, d_model, n_heads, d_ff, n_layers, num_classes, max_len=256, dropout=0.1):
        super().__init__()
        self.encoder = TransformerEncoder(vocab_size, d_model, n_heads, d_ff, n_layers, max_len, dropout)
        self.classifier = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model // 2, num_classes),
        )

    def forward(self, x, mask=None):
        """Return class logits for a batch of token ids.

        Args:
            x: Token-id tensor passed to the encoder.
            mask: Optional attention mask passed to the encoder.

        Returns:
            The classifier head's output for each sequence in the batch.
        """
        enc_out = self.encoder(x, mask)
        # Only the representation at the first sequence position feeds the head.
        cls_rep = enc_out[:, 0, :]
        return self.classifier(cls_rep)

# Build a small two-class classifier and push one random batch through it.
classifier = TransformerClassifier(
    vocab_size=10000,
    d_model=128,
    n_heads=4,
    d_ff=512,
    n_layers=3,
    num_classes=2,
)

params = sum(p.numel() for p in classifier.parameters())
# Random token ids in [1, 10000): 4 sequences of 30 tokens each.
batch_ids = torch.randint(1, 10000, (4, 30))
logits = classifier(batch_ids)

print("Text Classifier (sentiment analysis setup):")
print(f"Parameters: {params:,}")
print(f"Input shape: {batch_ids.shape} (4 sentences, 30 tokens each)")
print(f"Output shape: {logits.shape} (4 sentences, 2 classes)")
print()
print("The [CLS] token (position 0) aggregates the full sequence.")
print("Its representation is used for classification.")
print("This is exactly how BERT does classification tasks.")

작은 Transformer 학습시키기

from torch.utils.data import DataLoader , Dataset

class FakeTextDataset(Dataset):
    """Synthetic dataset of random token sequences with random binary labels.

    Args:
        n_samples: Number of sequences to generate.
        seq_len: Number of token ids per sequence.
        vocab_size: Exclusive upper bound for token ids; ids start at 1.
    """

    def __init__(self, n_samples=1000, seq_len=20, vocab_size=1000):
        self.data = torch.randint(1, vocab_size, (n_samples, seq_len))
        self.labels = torch.randint(0, 2, (n_samples,))

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, i):
        """Return the ``(token_ids, label)`` pair at index ``i``."""
        return self.data[i], self.labels[i]


# 80/20 train/test split of the synthetic data.
dataset = FakeTextDataset(n_samples=2000, seq_len=20, vocab_size=5000)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_ds, test_ds = torch.utils.data.random_split(dataset, [train_size, test_size])
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False)

model_small = TransformerClassifier(
    vocab_size=5000,
    d_model=64,
    n_heads=4,
    d_ff=256,
    n_layers=2,
    num_classes=2,
    dropout=0.1,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_small = model_small.to(device)
optimizer = torch.optim.AdamW(model_small.parameters(), lr=3e-4, weight_decay=0.01)
criterion = nn.CrossEntropyLoss()

print(f"Training small transformer ({sum(p.numel() for p in model_small.parameters()):,} params):")
print(f"{'Epoch':>6} {'Train Loss':>12} {'Train Acc':>10} {'Test Acc':>10}")
print("=" * 42)

for epoch in range(10):
    model_small.train()
    # NOTE(review): the source text is cut off at this point, mid-statement
    # ("total_loss = corr"). The rest of the training loop lies outside this
    # excerpt and must be restored from the original post.

80. Transformer: 모든 것을 바꾼 아키텍처

요약

핵심 포인트

인코더 레이어 (The Encoder Layer)

댓글