Bỏ qua

Evaluation API Reference

ModelConfig

class ModelConfig:
    """Cấu hình Vietnamese LLM từ 125M đến 7B."""

    def __init__(
        d_model=768,
        so_dau=12,
        d_ff=3072,
        so_block=12,
        so_tu_vung=32000,
        do_dai_toi_da=2048,
        dropout=0.1,
        ten="custom",
        mo_ta="",
    )

    def from_preset(preset) -> ModelConfig       # classmethod
    def danh_sach_presets() -> dict               # classmethod
    def to_dict() -> dict

    @property
    def so_tham_so(self) -> int
    @property
    def so_tham_so_str(self) -> str               # "125M", "1.3B", ...

Presets

| Preset | d_model | so_dau | so_block | so_tu_vung | Params |
|---|---|---|---|---|---|
| vnlm-tiny | 64 | 2 | 2 | 1,000 | ~10M |
| vnlm-small | 768 | 12 | 12 | 32,000 | ~125M |
| vnlm-medium | 1,024 | 16 | 24 | 32,000 | ~350M |
| vnlm-large | 2,048 | 32 | 24 | 32,000 | ~1.3B |
| vnlm-xl | 2,560 | 32 | 32 | 32,000 | ~2.7B |
| vnlm-7b | 4,096 | 32 | 32 | 32,000 | ~6.8B |

LMEvalHarness

class LMEvalHarness:
    """LM Evaluation Framework."""

    def __init__()

    def dang_ky_task(task: EvalTask) -> None
    def danh_sach_tasks() -> dict
    def danh_gia(model, cac_task=None, so_shot=0, limit=None) -> dict
    def bao_cao() -> str
    def thong_ke() -> dict

danh_gia() Returns

{
    "so_tasks": int,
    "so_shot": int,
    "tong_thoi_gian": float,
    "ket_qua": {
        "task_name": {
            "accuracy": float,
            "perplexity": float,
            # ... metrics tùy task type
        },
    },
    "tong_hop": dict,  # Aggregated metrics
}

EvalTask

class EvalTask:
    """Định nghĩa một evaluation task."""

    def __init__(
        ten: str,
        loai: str = "text_generation",  # text_generation (mặc định), classification, perplexity, qa, cloze
        du_lieu: list = None,           # [{text, label}, ...]
        metrics: list = None,           # ["accuracy", "f1", ...]
        mo_ta: str = "",
    )
    def to_dict() -> dict

BenchmarkRunner

class BenchmarkRunner:
    """Benchmark Runner cho Vietnamese LLM."""

    def __init__()

    def chay(model, benchmarks=None, so_shot=0) -> dict
    def bao_cao() -> str
    def thong_ke() -> dict

Benchmarks

| Tên | Mô tả | Metrics |
|---|---|---|
| perplexity | Perplexity trên corpus tiếng Việt | perplexity_mean, perplexity_std |
| generation | Sinh văn bản | do_dai_trung_binh, thoi_gian |
| sentiment | Phân loại cảm xúc | accuracy |
| speed | Tốc độ inference | latency_mean_ms, latency_p95_ms |
| qa | Hỏi đáp | accuracy |

chay() Returns

{
    "perplexity": {
        "perplexity_mean": float,
        "perplexity_std": float,
        "perplexity_min": float,
        "perplexity_max": float,
        "so_mau": int,
    },
    "generation": {
        "so_prompts": int,
        "do_dai_trung_binh": float,
        "chi_tiet": list,
    },
    "sentiment": {
        "accuracy": float,
        "so_mau": int,
    },
    "speed": {
        "latency_mean_ms": float,
        "latency_p50_ms": float,
        "latency_p95_ms": float,
        "latency_p99_ms": float,
        "so_lan_chay": int,
    },
    "tong_thoi_gian": float,
    "benchmarks": list,
}