GPT & PreTrainer API Reference¶
GPTModel¶
class GPTModel:
"""GPT-style Decoder-Only Transformer."""
def __init__(
d_model=128, # Hidden dimension
so_dau=4, # Số attention heads
d_ff=512, # Feed-forward dimension
so_block=4, # Số transformer blocks
so_tu_vung=5000, # Vocabulary size
do_dai_toi_da=1024, # Max sequence length
dropout=0.1, # Dropout rate
)
def tien(input_ids) -> np.ndarray # Forward: (batch, seq_len, vocab)
def sinh_tiep(input_ids, so_token, nhiet_do, top_k, top_p) -> np.ndarray
def tinh_loss(input_ids, targets) -> float
def thong_ke() -> dict
Methods¶
tien(input_ids) -> np.ndarray¶
Forward pass. Trả về logits cho next token prediction.
- input_ids: `(batch, seq_len)` - Token IDs
- Returns: `(batch, seq_len, so_tu_vung)` - Logits
sinh_tiep(input_ids, so_token=50, nhiet_do=1.0, top_k=0, top_p=0.0)¶
Sinh token tiếp theo tự hồi quy.
| Parameter | Type | Mặc định | Mô tả |
|---|---|---|---|
| `input_ids` | `np.ndarray` | required | Token IDs ban đầu |
| `so_token` | `int` | 50 | Số token cần sinh |
| `nhiet_do` | `float` | 1.0 | Temperature (0.1=conservative, 2.0=creative) |
| `top_k` | `int` | 0 | Top-k sampling (0=disabled) |
| `top_p` | `float` | 0.0 | Nucleus sampling (0=disabled) |
tinh_loss(input_ids, targets) -> float¶
Tính cross-entropy loss.
- input_ids: `(batch, seq_len)` - Input tokens
- targets: `(batch, seq_len)` - Target tokens
CausalSelfAttention¶
class CausalSelfAttention:
"""Causal (masked) self-attention cho decoder."""
def __init__(d_model, so_dau, dropout=0.1)
def tien(X) -> np.ndarray
GPTBlock¶
class GPTBlock:
"""GPT Transformer Block: CausalAttention + FFN + LayerNorm."""
def __init__(d_model, so_dau, d_ff, dropout=0.1)
def tien(X) -> np.ndarray
TextDataset¶
class TextDataset:
"""Dataset cho pre-training."""
def __init__(do_dai_window=512, buoc_nhay=256, seed=42)
def tai_corpus(cac_van_ban, vocab_size=5000) -> dict
def chia_du_lieu(ty_le_val=0.1, seed=None) -> dict
def iter_batches(batch_size=32, che_do="train") -> generator
def ma_hoa(text) -> list
def giai_ma(ids) -> str
def thong_ke() -> dict
@property
def vocab_size(self) -> int
@property
def so_chunks(self) -> int
Methods¶
tai_corpus(cac_van_ban, vocab_size=5000) -> dict¶
Load corpus và tokenize.
ket_qua = dataset.tai_corpus(
["văn bản 1", "văn bản 2"],
vocab_size=5000,
)
# Returns: {"so_van_ban": 2, "so_tokens": 150, "so_chunks": 10, "vocab_size": 200}
iter_batches(batch_size=32, che_do="train") -> generator¶
Yield (input_ids, targets) batches.
for input_ids, targets in dataset.iter_batches(32):
# input_ids: (32, window-1)
# targets: (32, window-1)
loss = model.tinh_loss(input_ids, targets)
PreTrainer¶
class PreTrainer:
"""Pre-training trainer cho GPT-style models."""
def __init__(
so_vong=10,
kich_thuoc_batch=32,
toc_do_hoc=3e-4,
gradient_accumulation=1,
warmup_steps=100,
weight_decay=0.1,
gradient_clip=1.0,
logging_steps=10,
eval_steps=500,
seed=42,
)
def huan_luyen(model, dataset, callback=None) -> dict
def lay_lich_su() -> dict
def thong_ke() -> dict