Metrics¶

The metrics system provides per-sample scoring, corpus-level aggregation, and reduction functions for handling repeated evaluations.

Source

Built-in Metrics¶

Loglikelihood Metrics¶

acc ¶

acc(references: int | list[int], predictions: LLResults, multiple_targets=False) -> int

Accuracy.

For multiple-choice (multiple lls): 1 if argmax(lls) matches gold. For a single loglikelihood (one ll): 1 if the continuation was decoded greedily.

Source code in lm_eval/api/metrics/ll.py

@metric(
    "acc",
    higher_is_better=True,
    output_type=["loglikelihood", "multiple_choice"],
    aggregation="mean",
)
def acc(
    references: int | list[int], predictions: LLResults, multiple_targets=False
) -> int:
    """Score one document for accuracy.

    With exactly one log-likelihood the score is the greedy flag of that
    single continuation. Otherwise the index of the largest log-likelihood
    is compared against ``references`` by ``_check_acc``.
    """
    single_continuation = len(predictions.lls) == 1
    if not single_continuation:
        best_choice = int(np.argmax(predictions.lls))
        return _check_acc(references, best_choice, multiple_targets)
    # Plain loglikelihood request: correct iff the continuation was greedy.
    return int(predictions.is_greedy[0])

acc_norm ¶

acc_norm(references: int | list[int], predictions: LLResults, multiple_targets=False) -> int

Character-length-normalised accuracy: picks the choice with the highest ll / char_len.

Source code in lm_eval/api/metrics/ll.py

@metric(
    "acc_norm",
    higher_is_better=True,
    output_type="multiple_choice",
    aggregation="mean",
)
def acc_norm(
    references: int | list[int], predictions: LLResults, multiple_targets=False
) -> int:
    """Accuracy after dividing each log-likelihood by ``predictions.char_len()``.

    The choice with the largest ``ll / char_len`` is the prediction, which
    ``_check_acc`` then compares against ``references``.
    """
    per_char_lls = predictions.lls / predictions.char_len()
    best_choice = np.argmax(per_char_lls).item()
    return _check_acc(references, best_choice, multiple_targets)

acc_bytes ¶

acc_bytes(references: int | list[int], predictions: LLResults, multiple_targets=False) -> int

Byte-length-normalised accuracy: picks the choice with the highest ll / byte_len.

Source code in lm_eval/api/metrics/ll.py

@metric(
    "acc_bytes",
    higher_is_better=True,
    output_type="multiple_choice",
    aggregation="mean",
)
def acc_bytes(
    references: int | list[int], predictions: LLResults, multiple_targets=False
) -> int:
    """Accuracy after dividing each log-likelihood by ``predictions.byte_len()``.

    The choice with the largest ``ll / byte_len`` is the prediction, which
    ``_check_acc`` then compares against ``references``.
    """
    per_byte_lls = predictions.lls / predictions.byte_len()
    best_choice = np.argmax(per_byte_lls).item()
    return _check_acc(references, best_choice, multiple_targets)

acc_mutual_info_fn ¶

acc_mutual_info_fn(references: int | list[int], predictions: LLResults, multiple_targets=False) -> int

Mutual-information-weighted accuracy: picks the choice with the highest ll - ll_unconditional.

Source code in lm_eval/api/metrics/ll.py

@metric(
    "acc_mutual_info",
    higher_is_better=True,
    output_type="multiple_choice",
    aggregation="mean",
)
def acc_mutual_info_fn(
    references: int | list[int], predictions: LLResults, multiple_targets=False
) -> int:
    """Accuracy using the mutual-information scores in ``predictions.lls_mutual_info``.

    The index of the largest entry is the prediction, which ``_check_acc``
    then compares against ``references``.
    """
    mutual_info_scores = predictions.lls_mutual_info
    best_choice = np.argmax(mutual_info_scores).item()
    return _check_acc(references, best_choice, multiple_targets)

exact_match_mc ¶

exact_match_mc(references: int | list[int], predictions: LLResults) -> int

1 if the gold completion was decoded greedily (every token was argmax), else 0.

Source code in lm_eval/api/metrics/ll.py

@metric(
    "exact_match_mc",
    higher_is_better=True,
    output_type="multiple_choice",
    aggregation="mean",
)
def exact_match_mc(references: int | list[int], predictions: LLResults) -> int:
    """Return 1 when a gold completion carries a truthy greedy flag, else 0.

    A list of gold indices scores 1 if *any* listed index is greedy.
    The index ``-100`` never counts as a match.
    """
    greedy_flags = predictions.is_greedy
    if not isinstance(references, list):
        if references == -100:
            return 0
        return int(greedy_flags[int(references)])
    return int(any(idx != -100 and greedy_flags[idx] for idx in references))

bpb ¶

bpb(references: int, predictions: LLResults) -> float

Bits-per-byte of the gold completion: -ll[gold] / byte_len[gold] * NAT_TO_BIT.

Lower is better — measures how many bits the model needs per byte of the correct answer.

Source code in lm_eval/api/metrics/ll.py

@metric(
    "bpb",
    higher_is_better=False,
    output_type=["loglikelihood", "multiple_choice"],
    aggregation="mean",
)
def bpb(references: int, predictions: LLResults) -> float:
    """Bits-per-byte of the gold completion.

    Computed as ``-ll[gold] / byte_len[gold] * _NAT_TO_BIT``; lower is better.
    """
    gold_ll = predictions.lls[references]
    gold_bytes = predictions.byte_len()[references]
    nats_per_byte = -gold_ll / gold_bytes
    return nats_per_byte * _NAT_TO_BIT

logprob_fn ¶

logprob_fn(references: int, predictions: LLResults) -> float

Raw log-probability of the gold completion (in nats).

Source code in lm_eval/api/metrics/ll.py

@metric(
    "logprob",
    higher_is_better=True,
    output_type=["loglikelihood", "multiple_choice"],
    aggregation="mean",
)
def logprob_fn(references: int, predictions: LLResults) -> float:
    """Return the gold completion's log-likelihood (``lls[references]``) as a Python scalar."""
    gold_ll = predictions.lls[references]
    return gold_ll.item()

brier_score ¶

brier_score(references: int, predictions: LLResults) -> float

Per-sample Brier score: sum of squared errors between softmax probs and one-hot gold.

Source code in lm_eval/api/metrics/ll.py

@metric(
    "brier_score",
    higher_is_better=False,
    output_type="multiple_choice",
    aggregation="mean",
)
def brier_score(references: int, predictions: LLResults) -> float:
    """Per-sample Brier score.

    Applies ``_softmax`` to the log-likelihoods, builds a one-hot target at
    index ``references``, and returns the sum of squared differences.
    """
    probabilities = _softmax(np.array(predictions.lls))
    target = np.zeros_like(probabilities)
    target[references] = 1.0
    squared_error = (probabilities - target) ** 2
    return np.sum(squared_error).item()

Generation Metrics¶

exact_match_fn ¶

exact_match_fn(references: list[list[str]], predictions: list[str], *, multiple_targets: Literal[True] = ..., **kwargs) -> dict[str, list[int]]

exact_match_fn(references: list[str], predictions: list[str], *, multiple_targets: Literal[False] = ..., **kwargs) -> dict[str, list[int]]

exact_match_fn(references: list[str] | list[list[str]], predictions: list[str], multiple_targets: bool = False, **kwargs) -> dict[str, list[int]]

Source code in lm_eval/api/metrics/generation.py

@metric(
    "exact_match",
    higher_is_better=True,
    output_type="generate_until",
    aggregation="mean",
    reduction="pass@k",
)
def exact_match_fn(
    references: list[str] | list[list[str]],
    predictions: list[str],
    multiple_targets: bool = False,
    **kwargs,
) -> dict[str, list[int]]:
    """Exact-match scores for generated predictions.

    Args:
        references: Target strings. With ``multiple_targets`` the acceptable
            targets are ``references[0]`` when that is a list, otherwise
            ``references`` itself.
        predictions: Generated strings to score.
        multiple_targets: When true, a prediction scores 1 if it matches
            *any* acceptable target.
        **kwargs: Forwarded unchanged to ``_exact_match_hf_evaluate``.

    Returns:
        With ``multiple_targets``: ``{"exact_match": [...]}`` holding one 0/1
        score per prediction. Otherwise whatever ``_exact_match_hf_evaluate``
        returns for the prediction/reference lists.
    """
    if not multiple_targets:
        return _exact_match_hf_evaluate(
            predictions=predictions, references=cast("list[str]", references), **kwargs
        )

    first_ref = references[0]
    targets = first_ref if isinstance(first_ref, list) else references
    n_targets = len(targets)
    n_preds = len(predictions)

    # Pair every prediction with every target: each prediction is repeated
    # n_targets times while the target list is tiled n_preds times.
    paired_preds = [pred for pred in predictions for _ in range(n_targets)]
    paired_refs = list(targets) * n_preds
    raw = _exact_match_hf_evaluate(
        predictions=paired_preds,
        references=paired_refs,
        **kwargs,
    )

    # One row per prediction, one column per target; a row scores 1 when
    # any of its columns matched.
    match_grid = np.array(raw["exact_match"]).reshape(n_preds, n_targets)
    return {"exact_match": match_grid.any(axis=1).astype(int).tolist()}

Aggregation Functions¶

mean ¶

mean(arr)

Source code in lm_eval/api/metrics/aggregations.py

@aggregation("mean")
def mean(arr):
    """Arithmetic mean of ``arr``: ``sum(arr) / len(arr)``."""
    total = sum(arr)
    count = len(arr)
    return total / count

median ¶

median(arr)

Source code in lm_eval/api/metrics/aggregations.py

@aggregation("median")
def median(arr):
    """Return the median of ``arr``.

    For an odd number of values this is the middle element of the sorted
    data. For an even number it is the mean of the two middle elements
    (previously the upper of the two was returned, which is not the median).

    Args:
        arr: Sequence of numeric values.

    Returns:
        The median value.

    Raises:
        IndexError: If ``arr`` is empty.
    """
    ordered = sorted(arr)
    mid = len(ordered) // 2
    if len(ordered) % 2:
        return ordered[mid]
    # Even count: average the two central values. For an empty input this
    # indexes ordered[-1] and raises IndexError, as before.
    return (ordered[mid - 1] + ordered[mid]) / 2

nanmean ¶

nanmean(arr)

Source code in lm_eval/api/metrics/aggregations.py

@aggregation("nanmean")
def nanmean(arr):
    """Mean of ``arr`` ignoring NaNs; NaN when ``arr`` is empty or all-NaN."""
    if not len(arr):
        return np.nan
    if all(np.isnan(arr)):
        return np.nan
    return np.nanmean(arr)

weighted_mean ¶

weighted_mean(items)

Source code in lm_eval/api/metrics/aggregations.py

def weighted_mean(items):
    a, b = zip(*items, strict=True)
    return sum(a) / sum(b)

perplexity ¶

perplexity(items)

Source code in lm_eval/api/metrics/aggregations.py

@aggregation("perplexity")
def perplexity(items):
    """Perplexity of ``items``: ``exp(-mean(items))``."""
    average = mean(items)
    return math.exp(-average)

weighted_perplexity ¶

weighted_perplexity(items)

Source code in lm_eval/api/metrics/aggregations.py

@aggregation("weighted_perplexity")
def weighted_perplexity(items):
    """Weighted perplexity: ``exp(-weighted_mean(items))``."""
    average = weighted_mean(items)
    return math.exp(-average)

bits_per_byte ¶

bits_per_byte(items)

Source code in lm_eval/api/metrics/aggregations.py

@aggregation("bits_per_byte")
def bits_per_byte(items):
    """Bits per byte: ``-weighted_mean(items)`` divided by ``log(2)``."""
    negated_average = -weighted_mean(items)
    return negated_average / math.log(2)

Corpus-Level Metrics¶

Metrics that must operate across the entire corpus rather than per-sample.

Perplexity ¶

Bases: CorpusMetric['LLResults', float]


              flowchart TD
              lm_eval.api.metrics.Perplexity[Perplexity]
              lm_eval.api.metrics.corpus.CorpusMetric[CorpusMetric]

                              lm_eval.api.metrics.corpus.CorpusMetric --> lm_eval.api.metrics.Perplexity
                


              click lm_eval.api.metrics.Perplexity href "" "lm_eval.api.metrics.Perplexity"
              click lm_eval.api.metrics.corpus.CorpusMetric href "" "lm_eval.api.metrics.corpus.CorpusMetric"

Corpus-level perplexity for loglikelihood tasks.

Per-document: extracts the gold log-likelihood. Aggregation: exp(-mean(lls)) across all documents.

Functions¶

__call__ ¶

__call__(references: int | list[int], predictions: LLResults) -> float

Source code in lm_eval/api/metrics/corpus.py

def __call__(self, references: int | list[int], predictions: LLResults) -> float:
    """Return the gold log-likelihood for one document.

    With a single log-likelihood that value is used directly; otherwise
    ``references`` indexes into ``predictions.lls``.
    """
    lls = predictions.lls
    gold_index = 0 if len(lls) == 1 else references
    return float(lls[gold_index])

aggregation ¶

aggregation(items: Sequence[float]) -> float

Source code in lm_eval/api/metrics/corpus.py

def aggregation(self, items: Sequence[float]) -> float:
    """Corpus perplexity: ``exp`` of the negated mean of ``items``."""
    negated_mean = -sum(items) / len(items)
    return math.exp(negated_mean)

WordPerplexity ¶

Bases: CorpusMetric['LLResults', tuple[float, int]]


              flowchart TD
              lm_eval.api.metrics.WordPerplexity[WordPerplexity]
              lm_eval.api.metrics.corpus.CorpusMetric[CorpusMetric]

                              lm_eval.api.metrics.corpus.CorpusMetric --> lm_eval.api.metrics.WordPerplexity
                


              click lm_eval.api.metrics.WordPerplexity href "" "lm_eval.api.metrics.WordPerplexity"
              click lm_eval.api.metrics.corpus.CorpusMetric href "" "lm_eval.api.metrics.corpus.CorpusMetric"

Corpus-level word perplexity for rolling loglikelihood tasks.

Computes the exponentiated average negative log-likelihood per word across all documents, weighted by word count.

Lower scores are better.

Functions¶

__call__ ¶

__call__(references: int, predictions: LLResults) -> tuple[float, int]

Source code in lm_eval/api/metrics/corpus.py

def __call__(self, references: int, predictions: LLResults) -> tuple[float, int]:
    """Return ``(lls[references], word_len()[references])`` for one document."""
    gold_ll = float(predictions.lls[references])
    gold_word_len = int(predictions.word_len()[references])
    return gold_ll, gold_word_len

aggregation ¶

aggregation(items: Sequence[tuple[float, int]]) -> float

Source code in lm_eval/api/metrics/corpus.py

def aggregation(self, items: Sequence[tuple[float, int]]) -> float:
    """Word perplexity: ``exp(-_weighted_mean(items))`` over ``(ll, word_len)`` pairs."""
    weighted_avg = _weighted_mean(items)
    return math.exp(-weighted_avg)

BytePerplexity ¶

Bases: CorpusMetric['LLResults', tuple[float, int]]


              flowchart TD
              lm_eval.api.metrics.BytePerplexity[BytePerplexity]
              lm_eval.api.metrics.corpus.CorpusMetric[CorpusMetric]

                              lm_eval.api.metrics.corpus.CorpusMetric --> lm_eval.api.metrics.BytePerplexity
                


              click lm_eval.api.metrics.BytePerplexity href "" "lm_eval.api.metrics.BytePerplexity"
              click lm_eval.api.metrics.corpus.CorpusMetric href "" "lm_eval.api.metrics.corpus.CorpusMetric"

Corpus-level byte perplexity for rolling loglikelihood tasks.

Computes the exponentiated average negative log-likelihood per byte across all documents, weighted by byte count.

Lower scores are better.

Functions¶

__call__ ¶

__call__(references: int, predictions: LLResults) -> tuple[float, int]

Source code in lm_eval/api/metrics/corpus.py

def __call__(self, references: int, predictions: LLResults) -> tuple[float, int]:
    """Return ``(lls[references], byte_len()[references])`` for one document."""
    gold_ll = float(predictions.lls[references])
    gold_byte_len = int(predictions.byte_len()[references])
    return gold_ll, gold_byte_len

aggregation ¶

aggregation(items: Sequence[tuple[float, int]]) -> float

Source code in lm_eval/api/metrics/corpus.py

def aggregation(self, items: Sequence[tuple[float, int]]) -> float:
    """Byte perplexity: ``exp(-_weighted_mean(items))`` over ``(ll, byte_len)`` pairs."""
    weighted_avg = _weighted_mean(items)
    return math.exp(-weighted_avg)

BitsPerByte ¶

Bases: CorpusMetric['LLResults', tuple[float, int]]


              flowchart TD
              lm_eval.api.metrics.BitsPerByte[BitsPerByte]
              lm_eval.api.metrics.corpus.CorpusMetric[CorpusMetric]

                              lm_eval.api.metrics.corpus.CorpusMetric --> lm_eval.api.metrics.BitsPerByte
                


              click lm_eval.api.metrics.BitsPerByte href "" "lm_eval.api.metrics.BitsPerByte"
              click lm_eval.api.metrics.corpus.CorpusMetric href "" "lm_eval.api.metrics.corpus.CorpusMetric"

Corpus-level bits-per-byte for rolling loglikelihood tasks.

Converts the average negative log-likelihood per byte into bits by dividing by log(2), weighted by byte count across all documents.

Lower scores are better.

Functions¶

__call__ ¶

__call__(references: int, predictions: LLResults) -> tuple[float, int]

Source code in lm_eval/api/metrics/corpus.py

def __call__(self, references: int, predictions: LLResults) -> tuple[float, int]:
    """Return ``(lls[references], byte_len()[references])`` for one document."""
    gold_ll = float(predictions.lls[references])
    gold_byte_len = int(predictions.byte_len()[references])
    return gold_ll, gold_byte_len

aggregation ¶

aggregation(items: Sequence[tuple[float, int]]) -> float

Source code in lm_eval/api/metrics/corpus.py

def aggregation(self, items: Sequence[tuple[float, int]]) -> float:
    """Bits per byte: ``-_weighted_mean(items)`` converted from nats by dividing by ``log(2)``."""
    negated_avg = -_weighted_mean(items)
    return negated_avg / math.log(2)

Bleu ¶

Bases: _SacrebleuCorpusMetric


              flowchart TD
              lm_eval.api.metrics.Bleu[Bleu]
              lm_eval.api.metrics.corpus._SacrebleuCorpusMetric[_SacrebleuCorpusMetric]
              lm_eval.api.metrics.corpus.CorpusMetric[CorpusMetric]

                              lm_eval.api.metrics.corpus._SacrebleuCorpusMetric --> lm_eval.api.metrics.Bleu
                                lm_eval.api.metrics.corpus.CorpusMetric --> lm_eval.api.metrics.corpus._SacrebleuCorpusMetric
                



              click lm_eval.api.metrics.Bleu href "" "lm_eval.api.metrics.Bleu"
              click lm_eval.api.metrics.corpus._SacrebleuCorpusMetric href "" "lm_eval.api.metrics.corpus._SacrebleuCorpusMetric"
              click lm_eval.api.metrics.corpus.CorpusMetric href "" "lm_eval.api.metrics.corpus.CorpusMetric"

BLEU score for generated text.

The Bilingual Evaluation Understudy Score counts matching n-grams in the candidate translation to n-grams in the reference text.

Higher is better.

Functions¶

aggregation ¶

aggregation(items: Sequence[tuple[list[str], list[str]]]) -> float

Source code in lm_eval/api/metrics/corpus.py

def aggregation(self, items: Sequence[tuple[list[str], list[str]]]) -> float:
    """Corpus BLEU over ``(references, predictions)`` items.

    The items are split into two parallel tuples, passed through
    ``_sacreformat``, and scored with ``sacrebleu.corpus_bleu``.
    """
    import sacrebleu

    raw_refs, raw_preds = zip(*items, strict=True)
    formatted_refs, formatted_preds = _sacreformat(raw_refs, raw_preds)
    bleu = sacrebleu.corpus_bleu(formatted_preds, formatted_refs)
    return bleu.score

Chrf ¶

Bases: _SacrebleuCorpusMetric


              flowchart TD
              lm_eval.api.metrics.Chrf[Chrf]
              lm_eval.api.metrics.corpus._SacrebleuCorpusMetric[_SacrebleuCorpusMetric]
              lm_eval.api.metrics.corpus.CorpusMetric[CorpusMetric]

                              lm_eval.api.metrics.corpus._SacrebleuCorpusMetric --> lm_eval.api.metrics.Chrf
                                lm_eval.api.metrics.corpus.CorpusMetric --> lm_eval.api.metrics.corpus._SacrebleuCorpusMetric
                



              click lm_eval.api.metrics.Chrf href "" "lm_eval.api.metrics.Chrf"
              click lm_eval.api.metrics.corpus._SacrebleuCorpusMetric href "" "lm_eval.api.metrics.corpus._SacrebleuCorpusMetric"
              click lm_eval.api.metrics.corpus.CorpusMetric href "" "lm_eval.api.metrics.corpus.CorpusMetric"

chrF++ score for generated text.

chrF++ is based on character n-gram precision and recall enhanced with word n-grams.

Higher is better.

Functions¶

aggregation ¶

aggregation(items: Sequence[tuple[list[str], list[str]]]) -> float

Source code in lm_eval/api/metrics/corpus.py

def aggregation(self, items: Sequence[tuple[list[str], list[str]]]) -> float:
    """Corpus chrF over ``(references, predictions)`` items.

    The items are split into two parallel tuples, passed through
    ``_sacreformat``, and scored with ``sacrebleu.corpus_chrf``.
    """
    import sacrebleu

    raw_refs, raw_preds = zip(*items, strict=True)
    formatted_refs, formatted_preds = _sacreformat(raw_refs, raw_preds)
    chrf = sacrebleu.corpus_chrf(formatted_preds, formatted_refs)
    return chrf.score

Ter ¶

Bases: _SacrebleuCorpusMetric


              flowchart TD
              lm_eval.api.metrics.Ter[Ter]
              lm_eval.api.metrics.corpus._SacrebleuCorpusMetric[_SacrebleuCorpusMetric]
              lm_eval.api.metrics.corpus.CorpusMetric[CorpusMetric]

                              lm_eval.api.metrics.corpus._SacrebleuCorpusMetric --> lm_eval.api.metrics.Ter
                                lm_eval.api.metrics.corpus.CorpusMetric --> lm_eval.api.metrics.corpus._SacrebleuCorpusMetric
                



              click lm_eval.api.metrics.Ter href "" "lm_eval.api.metrics.Ter"
              click lm_eval.api.metrics.corpus._SacrebleuCorpusMetric href "" "lm_eval.api.metrics.corpus._SacrebleuCorpusMetric"
              click lm_eval.api.metrics.corpus.CorpusMetric href "" "lm_eval.api.metrics.corpus.CorpusMetric"

Translation Error Rate for generated text.

Measures the number of edits required to change a system output into one of the references.

Lower is better.

Functions¶

aggregation ¶

aggregation(items: Sequence[tuple[list[str], list[str]]]) -> float

Source code in lm_eval/api/metrics/corpus.py

def aggregation(self, items: Sequence[tuple[list[str], list[str]]]) -> float:
    """Corpus TER over ``(references, predictions)`` items.

    The items are split into two parallel tuples, passed through
    ``_sacreformat``, and scored with ``sacrebleu.corpus_ter``.
    """
    import sacrebleu

    raw_refs, raw_preds = zip(*items, strict=True)
    formatted_refs, formatted_preds = _sacreformat(raw_refs, raw_preds)
    ter = sacrebleu.corpus_ter(formatted_preds, formatted_refs)
    return ter.score

F1 ¶

Bases: CorpusMetric['LLResults', tuple[int, int]]


              flowchart TD
              lm_eval.api.metrics.F1[F1]
              lm_eval.api.metrics.corpus.CorpusMetric[CorpusMetric]

                              lm_eval.api.metrics.corpus.CorpusMetric --> lm_eval.api.metrics.F1
                


              click lm_eval.api.metrics.F1 href "" "lm_eval.api.metrics.F1"
              click lm_eval.api.metrics.corpus.CorpusMetric href "" "lm_eval.api.metrics.corpus.CorpusMetric"

F1 score for multiple choice tasks.

Computes the maximum F1 score between gold labels and predicted labels (argmax of log-likelihoods).

Higher is better.

Functions¶

__call__ ¶

__call__(references: Any, predictions: LLResults) -> tuple[int, int]

Source code in lm_eval/api/metrics/corpus.py

def __call__(self, references: Any, predictions: LLResults) -> tuple[int, int]:
    """Return ``(references, predicted index)``; the prediction is the argmax of ``predictions.lls``."""
    predicted_label = int(np.argmax(predictions.lls))
    return references, predicted_label

aggregation ¶

aggregation(items: Sequence[tuple[int, int]]) -> float

Source code in lm_eval/api/metrics/corpus.py

def aggregation(self, items: Sequence[tuple[int, int]]) -> float:
    """F1 over all ``(gold, predicted)`` pairs.

    Scores with ``sklearn.metrics.f1_score`` and returns the maximum of its
    result as a float.
    """
    from sklearn.metrics import f1_score

    gold_labels, predicted_labels = zip(*items, strict=True)
    scores = f1_score(gold_labels, predicted_labels)
    return float(np.max(scores))

MCC ¶

Bases: CorpusMetric['LLResults', tuple[int, int]]


              flowchart TD
              lm_eval.api.metrics.MCC[MCC]
              lm_eval.api.metrics.corpus.CorpusMetric[CorpusMetric]

                              lm_eval.api.metrics.corpus.CorpusMetric --> lm_eval.api.metrics.MCC
                


              click lm_eval.api.metrics.MCC href "" "lm_eval.api.metrics.MCC"
              click lm_eval.api.metrics.corpus.CorpusMetric href "" "lm_eval.api.metrics.corpus.CorpusMetric"

Matthews Correlation Coefficient for multiple choice tasks.

Computes MCC between gold labels and predicted labels (argmax of log-likelihoods).

Higher is better.

Functions¶

__call__ ¶

__call__(references: Any, predictions: LLResults) -> tuple[int, int]

Source code in lm_eval/api/metrics/corpus.py

def __call__(self, references: Any, predictions: LLResults) -> tuple[int, int]:
    """Return ``(references, predicted index)``; the prediction is the argmax of ``predictions.lls``."""
    predicted_label = int(np.argmax(predictions.lls))
    return references, predicted_label

aggregation ¶

aggregation(items: Sequence[tuple[int, int]]) -> float

Source code in lm_eval/api/metrics/corpus.py

def aggregation(self, items: Sequence[tuple[int, int]]) -> float:
    """Matthews correlation coefficient over all ``(gold, predicted)`` pairs."""
    from sklearn.metrics import matthews_corrcoef

    gold_labels, predicted_labels = zip(*items, strict=True)
    coefficient = matthews_corrcoef(gold_labels, predicted_labels)
    return float(coefficient)

Stderr Functions¶

stderr_for_metric ¶

stderr_for_metric(metric: Callable[[Sequence[T]], float], bootstrap_iters: int) -> Callable[[Sequence[T]], float] | None

Return a function that estimates the standard error of metric(xs).

mean has a closed-form SE (sample_stddev / sqrt(n)).
All other aggregations use bootstrap_stderr with bootstrap_iters draws.
Returns None when bootstrap_iters <= 0.

Source code in lm_eval/api/metrics/stderr.py

def stderr_for_metric(
    metric: Callable[[Sequence[T]], float], bootstrap_iters: int
) -> Callable[[Sequence[T]], float] | None:
    """Pick the standard-error estimator that matches `metric`.

    * Returns ``None`` when ``bootstrap_iters <= 0`` (stderr is not computed).
    * ``mean`` maps to the closed-form ``mean_stderr``.
    * Any other aggregation is wrapped in ``bootstrap_stderr`` using
      ``bootstrap_iters`` iterations.
    """
    # Non-positive iteration count: the caller asked for no stderr at all.
    if bootstrap_iters <= 0:
        return None

    # Everything except the plain mean has to be bootstrapped.
    if metric is not mean:
        return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters)

    return mean_stderr

bootstrap_stderr ¶

bootstrap_stderr(f: Callable[[Sequence[T]], float], xs: Sequence[T], iters: int) -> float

Bootstrap estimate of the standard error of statistic f(xs) using up to iters resamples, chunked (≤ 1000 draws).

Executes in parallel unless LMEVAL_DISABLE_MULTIPROC is set, in which case the non-multiprocessing fallback is used.

Source code in lm_eval/api/metrics/stderr.py

def bootstrap_stderr(
    f: Callable[[Sequence[T]], float], xs: Sequence[T], iters: int
) -> float:
    """Bootstrap estimate of the standard error of statistic `f(xs)`.

    Work is split into chunks of ``min(1000, iters)``. Only
    ``iters // chunk_size`` chunks are scheduled, so when ``iters`` exceeds
    1000 and is not a multiple of it the remainder is not run (hence "up to"
    ``iters`` resamples).

    Executes in a ``multiprocessing`` pool unless ``DISABLE_MULTIPROC`` is
    truthy (controlled by ``LMEVAL_DISABLE_MULTIPROC``), in which case
    ``_bootstrap_internal_no_mp`` is used instead.

    Args:
        f: Statistic to evaluate on each resample.
        xs: Original sample.
        iters: Requested number of bootstrap draws. Callers are expected to
            pass a positive value: with ``iters == 0`` the pool branch would
            compute ``0 // 0``.

    Returns:
        ``sample_stddev`` of the collected bootstrap statistics.
    """
    if not DISABLE_MULTIPROC:
        import multiprocessing as mp

        # NOTE: this gives a biased estimate of the stderr. For the mean it is
        # equivalent to a stderr computed without Bessel's correction in the
        # stddev. The right correction to make the bootstrap unbiased is not
        # known here; multiplying by sqrt(n / (n - 1)) was considered but would
        # be ad hoc, with no proof that it yields an unbiased estimator.
        # In practice this should not matter because samples are usually large.
        res = []
        chunk_size = min(1000, iters)
        from tqdm import tqdm

        print(f"Bootstrapping for stddev: {getattr(f, '__name__', repr(f))}")
        with mp.Pool(mp.cpu_count()) as pool:
            # Each work item is (chunk index, full sample).
            # NOTE(review): presumably the index seeds the worker's RNG and
            # each call yields `chunk_size` statistics; confirm against
            # `_bootstrap_internal`.
            for bootstrap in tqdm(
                pool.imap(
                    _bootstrap_internal(f, chunk_size),
                    [(i, xs) for i in range(iters // chunk_size)],
                ),
                total=iters // chunk_size,
            ):
                # Collect this chunk's statistics (resampling with replacement
                # happens inside the worker).
                res.extend(bootstrap)
    else:
        res = _bootstrap_internal_no_mp(f, xs, iters)

    return sample_stddev(res)

mean_stderr ¶

mean_stderr(arr)

Source code in lm_eval/api/metrics/stderr.py

def mean_stderr(arr):
    """Closed-form standard error of the mean: ``sample_stddev(arr) / sqrt(len(arr))``."""
    spread = sample_stddev(arr)
    root_n = math.sqrt(len(arr))
    return spread / root_n

Types¶

The core metric wrapper. Each metric defines a per-sample function, an aggregation strategy, and optionally a reduction for repeated samples.

Metric `dataclass` ¶

Metric(name: str, fn: MetricFn[_T], kwargs: Mapping[str, Any] = dict(), aggregation: AggregationFn[_K] | None = None, higher_is_better: bool = True, output_type: set[str] = (lambda: {'multiple_choice'})(), reduction: ReductionFn[_T, _K] | None = take_first)

Bases: Generic[_T, _K]


              flowchart TD
              lm_eval.api.metrics.Metric[Metric]

              

              click lm_eval.api.metrics.Metric href "" "lm_eval.api.metrics.Metric"

Encapsulates information about a single metric.

This is the canonical representation for metrics used throughout lm_eval.

CLASS TYPE PARAMETER	DESCRIPTION
`_T`	Per-sample result type from `fn`.
`_K`	Reduced type after collapsing repeats via `reduction`.

Type chain: fn(...) -> _T, reduction(...) -> _K, aggregation(Sequence[_K]) -> float.

Attributes¶

name `instance-attribute` ¶

name: str

fn `instance-attribute` ¶

fn: MetricFn[_T]

kwargs `class-attribute` `instance-attribute` ¶

kwargs: Mapping[str, Any] = field(default_factory=dict)

aggregation `class-attribute` `instance-attribute` ¶

aggregation: AggregationFn[_K] | None = None

higher_is_better `class-attribute` `instance-attribute` ¶

higher_is_better: bool = True

output_type `class-attribute` `instance-attribute` ¶

output_type: set[str] = field(default_factory=lambda: {'multiple_choice'})

reduction `class-attribute` `instance-attribute` ¶

reduction: ReductionFn[_T, _K] | None = take_first

Functions¶

__post_init__ ¶

__post_init__()

Source code in lm_eval/api/metrics/metric.py

def __post_init__(self):
    """Validate the fields and normalise ``output_type`` and ``reduction``.

    * ``name`` must be non-empty (``ValueError``).
    * ``output_type`` is stored as a set; a bare string becomes a one-element set.
    * ``fn`` must be callable (``TypeError``).
    * ``aggregation``, when given, must be callable (``ValueError``).
    * A ``None`` reduction is replaced by ``take_first``.
    """
    if not self.name:
        raise ValueError("Metric name must be non-empty.")

    # Accept a single string as well as any iterable of strings.
    if isinstance(self.output_type, str):
        output_types = [self.output_type]
    else:
        output_types = self.output_type
    object.__setattr__(self, "output_type", set(output_types))

    if not callable(self.fn):
        raise TypeError(
            f"Metric '{self.name}' fn must be callable, got {type(self.fn)}."
        )

    aggregation_given = self.aggregation is not None
    if aggregation_given and not callable(self.aggregation):
        raise ValueError(
            f"Metric '{self.name}' aggregation must be callable, got {type(self.aggregation)}."
        )

    if self.reduction is None:
        object.__setattr__(self, "reduction", take_first)

from_dict `classmethod` ¶

from_dict(cfg: dict[str, Any] | MetricConfig, output_type: str | None = None) -> Metric[Any, Any]

Source code in lm_eval/api/metrics/metric.py

@classmethod
def from_dict(
    cls, cfg: dict[str, Any] | MetricConfig, output_type: str | None = None
) -> Metric[Any, Any]:
    """Build a ``Metric`` from a config mapping or ``MetricConfig``.

    The config is passed through ``normalize_metric_cfg`` and the result is
    handed to ``utils.parse_metric`` together with ``output_type``.
    """
    from lm_eval.config.utils import normalize_metric_cfg

    from . import utils

    normalized_cfg = normalize_metric_cfg(cfg)
    return utils.parse_metric(normalized_cfg, output_type)

compute ¶

compute(*args: Any, **kwargs: Any) -> _T | dict[str, list[_T]]

Compute the metric for a sample.

Source code in lm_eval/api/metrics/metric.py

def compute(self, *args: Any, **kwargs: Any) -> _T | dict[str, list[_T]]:
    """Compute the metric for a sample.

    Call-time ``kwargs`` override the metric's stored ``kwargs``. The merged
    mapping is passed through ``filter_kwargs`` before ``fn`` is called.
    """
    merged_kwargs = {**self.kwargs, **kwargs}
    accepted_kwargs = filter_kwargs(self.fn, merged_kwargs)
    return self.fn(*args, **accepted_kwargs)

aggregate ¶

aggregate(values: Sequence[_K]) -> float

Aggregate a list of metric values into a single score.

Source code in lm_eval/api/metrics/metric.py

def aggregate(self, values: Sequence[_K]) -> float:
    """Aggregate a list of metric values into a single score.

    Raises:
        ValueError: If this metric has no aggregation function.
    """
    if self.aggregation is not None:
        return self.aggregation(values)
    raise ValueError(
        f"Metric {self.name} does not have an aggregation function."
    )

MetricFn ¶

Bases: Protocol[_T]


              flowchart TD
              lm_eval.api.metrics.MetricFn[MetricFn]

              

              click lm_eval.api.metrics.MetricFn href "" "lm_eval.api.metrics.MetricFn"

Callable that computes a per-sample metric value.

Functions¶

__call__ ¶

__call__(references: Any, predictions: Any, **kwargs: Any) -> _T | dict[str, list[_T]]

Source code in lm_eval/api/metrics/_types.py

def __call__(
    self, references: Any, predictions: Any, **kwargs: Any
) -> _T | dict[str, list[_T]]:
    """Compute a per-sample metric value.

    Takes the sample's ``references`` and ``predictions`` plus metric-specific
    keyword arguments. Returns either a single value or a dict mapping a
    name to a list of values.
    """
    ...

AggregationFn ¶

Bases: Protocol[_K]


              flowchart TD
              lm_eval.api.metrics.AggregationFn[AggregationFn]

              

              click lm_eval.api.metrics.AggregationFn href "" "lm_eval.api.metrics.AggregationFn"

Callable that aggregates per-document values into a corpus-level float.

Functions¶

__call__ ¶

__call__(values: Sequence[_K]) -> float

Source code in lm_eval/api/metrics/_types.py

def __call__(self, values: Sequence[_K]) -> float:
    """Aggregate per-document values into a single corpus-level float."""
    ...

ReductionFn ¶

Bases: Protocol[_T, _K]


              flowchart TD
              lm_eval.api.metrics.ReductionFn[ReductionFn]

              

              click lm_eval.api.metrics.ReductionFn href "" "lm_eval.api.metrics.ReductionFn"

Callable that reduces per-repeat scores into one value per document.

Functions¶

__call__ ¶

__call__(references: Any, predictions: Sequence[_T]) -> _K | _T | dict[str, _K] | dict[str, _T]

Source code in lm_eval/api/metrics/_types.py

def __call__(
    self, references: Any, predictions: Sequence[_T]
) -> _K | _T | dict[str, _K] | dict[str, _T]:
    """Reduce the per-repeat scores of one document into a single value.

    ``predictions`` holds one score per repeat. The result is either a
    single value or a dict of named values.
    """
    ...

CorpusMetric ¶

Bases: ABC, Generic[_R, _T]


              flowchart TD
              lm_eval.api.metrics.CorpusMetric[CorpusMetric]

              

              click lm_eval.api.metrics.CorpusMetric href "" "lm_eval.api.metrics.CorpusMetric"

Base class for corpus-level metrics.

Corpus-level metrics are computed across multiple samples and typically require aggregation of intermediate results.

Data flow

__call__(references, predictions: _R) -> _T   # per-document intermediate result
aggregation(list[_T]) -> float                # corpus-level score

Functions¶

__call__ `abstractmethod` ¶

__call__(references: Any, predictions: _R) -> _T

Compute the per-item metric value for a single document.

Source code in lm_eval/api/metrics/corpus.py

@abstractmethod
def __call__(self, references: Any, predictions: _R) -> _T:
    """Compute the per-item metric value for a single document.

    Subclasses return an intermediate result of type ``_T``; the collected
    results are later passed to ``aggregation``.
    """
    ...

aggregation `abstractmethod` ¶

aggregation(items: Sequence[_T]) -> float

Aggregate per-item values into a single corpus-level score.

Source code in lm_eval/api/metrics/corpus.py

@abstractmethod
def aggregation(self, items: Sequence[_T]) -> float:
    """Aggregate per-item values into a single corpus-level score.

    ``items`` holds one intermediate result per document, as produced by
    ``__call__``.
    """
    ...

reduce ¶

reduce(references: Sequence[Any], predictions: Sequence[_T], **kwargs) -> _T

Collapse multiple repeats of a sample into one value. Corpus metrics only support repeat=1.

Source code in lm_eval/api/metrics/corpus.py

def reduce(
    self, references: Sequence[Any], predictions: Sequence[_T], **kwargs
) -> _T:
    """Collapse multiple repeats of a sample into one value. Corpus metrics only support repeat=1.

    Args:
        references: References for the sample; not used by this implementation.
        predictions: Per-repeat results for the sample. Exactly one is expected.
        **kwargs: Accepted and ignored.

    Returns:
        The first element of ``predictions``. A one-time warning is logged
        when more than one result is supplied.

    Raises:
        IndexError: If ``predictions`` is empty.
    """
    n_results = len(predictions)
    if n_results == 0:
        # Fail up front with a clear message instead of logging the
        # "multiple results" warning and then hitting a bare index error.
        raise IndexError(
            f"CorpusMetric {self.__class__.__name__} received no results; expected exactly one."
        )
    if n_results != 1:
        warning_once(
            eval_logger,
            f"CorpusMetric {self.__class__.__name__} received multiple results; expected only one. Returning the first result.",
        )
    return predictions[0]

LLResults `dataclass` ¶

LLResults(results: list[Any], targets: int | list[int] | str | list[str], ctx: str = '', choices: Sequence[str] = list(), lls_mutual_info: NDArray[float64] = _empty_array(), metadata: dict[str, Any] = dict(), *, lls: NDArray[float64], is_greedy: Sequence[bool])

Per-doc bundle of log-likelihoods, greedy flags, and choices for loglikelihood tasks.

Built via from_instances from all LLInstances sharing a doc_id, and passed as predictions to metrics in LLScorer.

Attributes¶

results `instance-attribute` ¶

results: list[Any]

lls `class-attribute` `instance-attribute` ¶

lls: NDArray[float64] = field(kw_only=True)

is_greedy `class-attribute` `instance-attribute` ¶

is_greedy: Sequence[bool] = field(kw_only=True)

targets `instance-attribute` ¶

targets: int | list[int] | str | list[str]

ctx `class-attribute` `instance-attribute` ¶

ctx: str = ''

choices `class-attribute` `instance-attribute` ¶

choices: Sequence[str] = field(default_factory=list)

lls_mutual_info `class-attribute` `instance-attribute` ¶

lls_mutual_info: NDArray[float64] = field(default_factory=_empty_array)

metadata `class-attribute` `instance-attribute` ¶

metadata: dict[str, Any] = field(default_factory=dict)

Functions¶

char_len ¶

char_len() -> NDArray[float64]

Source code in lm_eval/api/metrics/results.py

def char_len(self) -> NDArray[float64]:
    """Return the character length of every choice as a float array.

    When there are no choices, an array of ones with one entry per
    log-likelihood is returned instead.
    """
    import numpy as np

    if not self.choices:
        return np.ones(len(self.lls))
    return np.array([float(len(choice)) for choice in self.choices])

byte_len ¶

byte_len(count_bytes: Callable[[str], float] = _count_bytes) -> NDArray[int64]

Source code in lm_eval/api/metrics/results.py

def byte_len(
    self, count_bytes: Callable[[str], float] = _count_bytes
) -> NDArray[float64]:
    """Return the length of every choice, as measured by ``count_bytes``.

    Args:
        count_bytes: Callable mapping a choice string to its length.
            Defaults to the module's ``_count_bytes``.

    Returns:
        A float array (built with ``dtype=float``) with one entry per
        choice. When there are no choices, every entry is 1 and there is
        one entry per log-likelihood.
    """
    import numpy as np

    if self.choices:
        lengths = [count_bytes(choice) for choice in self.choices]
    else:
        lengths = [1] * len(self.lls)
    return np.array(lengths, dtype=float)

word_len ¶

word_len(count_words: Callable[[str], float] = _count_words) -> NDArray[int64]

Source code in lm_eval/api/metrics/results.py

def word_len(
    self, count_words: Callable[[str], float] = _count_words
) -> NDArray[float64]:
    """Return the length of every choice, as measured by ``count_words``.

    Args:
        count_words: Callable mapping a choice string to its length.
            Defaults to the module's ``_count_words``.

    Returns:
        A float array (built with ``dtype=float``) with one entry per
        choice. When there are no choices, every entry is 1 and there is
        one entry per log-likelihood.
    """
    import numpy as np

    if self.choices:
        lengths = [count_words(choice) for choice in self.choices]
    else:
        lengths = [1] * len(self.lls)
    return np.array(lengths, dtype=float)

from_instances `classmethod` ¶

from_instances(results: Sequence[LLInstance], filter_name: str = 'none') -> Self

Source code in lm_eval/api/metrics/results.py

@classmethod
def from_instances(
    cls,
    results: Sequence[LLInstance],
    filter_name: str = "none",
) -> Self:
    """Build one result bundle from a group of loglikelihood instances.

    The context and target are taken from the first instance after
    sorting, so ``results`` is expected to hold the instances of a single
    document.

    Instances whose metadata sets ``"acc_mutual_info"`` are treated as the
    unconditional counterparts of the remaining (conditional) instances.
    When any are present, only the conditional log-likelihoods, greedy
    flags and choices are kept, and ``lls_mutual_info`` is the conditional
    minus the unconditional log-likelihoods. Otherwise ``lls_mutual_info``
    is the value returned by ``_empty_array()``.

    Args:
        results: Instances to combine.
        filter_name: Key into each instance's ``filtered_resps`` selecting
            which responses to use.

    Returns:
        A new instance of ``cls``.

    Raises:
        AssertionError: If mutual-info instances are present and either
            the instance count is not twice the conditional count or the
            conditional and unconditional choices differ. These checks are
            ``assert`` statements, so they are skipped under ``python -O``.
    """
    from itertools import chain

    import numpy as np

    # Sort by doc_id, then by the mutual-info flag; False orders before True,
    # so unflagged (conditional) instances come first within a doc_id.
    instances: list[LLInstance] = sorted(
        results,
        key=lambda x: (x.doc_id, x.metadata.get("acc_mutual_info", False)),
    )
    # Transpose the per-instance fields into four parallel tuples.
    resps, choices, targets, is_mi = zip(
        *(
            (
                inst.filtered_resps[filter_name],
                inst.args[1],
                inst.target,
                inst.metadata.get("acc_mutual_info", False),
            )
            for inst in instances
        ),
        strict=True,
    )

    # Flatten the per-instance responses and split the resulting pairs into
    # two parallel tuples: log-likelihoods and greedy flags.
    lls, is_greedy = zip(*chain.from_iterable(resps), strict=True)
    lls = np.array(lls)

    # Number of conditional (non-mutual-info) instances.
    n_cond = sum(not mi for mi in is_mi)
    if n_cond < len(instances):
        assert 2 * n_cond == len(resps), (
            f"Expected 2 * {n_cond} conditional instances == {len(resps)} total instances "
            "for mutual info. Please open an issue on github."
        )
        # per-element choices should be equal
        # Sort puts conditional instances first. Both sets share the same choice order (see MultipleChoiceTask._create_instances).
        assert choices[:n_cond] == choices[n_cond:], (
            "Conditional/unconditional choice order mismatch"
        )
        # Split: conditional 0..n_cond-1, unconditional n_cond..end
        lls, lls_unconditional = lls[:n_cond], lls[n_cond:]
        is_greedy, choices = is_greedy[:n_cond], choices[:n_cond]
        lls_mutual_info = lls - lls_unconditional
    else:
        lls_mutual_info = _empty_array()

    # ctx and targets are read from the first sorted instance only.
    return cls(
        results=list(resps),
        lls=lls,
        is_greedy=is_greedy,
        ctx=instances[0].args[0],
        targets=targets[0],
        choices=choices,
        lls_mutual_info=lls_mutual_info,
    )

to_metric_inputs ¶

to_metric_inputs()

Source code in lm_eval/api/metrics/results.py

def to_metric_inputs(self):
    """Return the keyword arguments for a metric call.

    The mapping has two entries: ``"references"`` holds this object's
    targets and ``"predictions"`` holds the object itself.
    """
    return dict(references=self.targets, predictions=self)

Metrics¶

Built-in Metrics¶

Loglikelihood Metrics¶

acc ¶

acc_norm ¶

acc_bytes ¶

acc_mutual_info_fn ¶

exact_match_mc ¶

bpb ¶

logprob_fn ¶

brier_score ¶

Generation Metrics¶

exact_match_fn ¶

Aggregation Functions¶

mean ¶

median ¶

nanmean ¶

weighted_mean ¶

perplexity ¶

weighted_perplexity ¶

bits_per_byte ¶

Corpus-Level Metrics¶

Perplexity ¶

Functions¶

__call__ ¶

aggregation ¶

WordPerplexity ¶

Functions¶

__call__ ¶

aggregation ¶

BytePerplexity ¶

Functions¶

__call__ ¶

aggregation ¶

BitsPerByte ¶

Functions¶

__call__ ¶

aggregation ¶

Bleu ¶

Functions¶

aggregation ¶

Chrf ¶

Functions¶

aggregation ¶

Ter ¶

Functions¶

aggregation ¶

F1 ¶

Functions¶

__call__ ¶

aggregation ¶

MCC ¶

Functions¶

__call__ ¶

aggregation ¶

Stderr Functions¶

stderr_for_metric ¶

bootstrap_stderr ¶

mean_stderr ¶

Types¶

Metric dataclass ¶

Attributes¶

name instance-attribute ¶

fn instance-attribute ¶

kwargs class-attribute instance-attribute ¶

aggregation class-attribute instance-attribute ¶

higher_is_better class-attribute instance-attribute ¶

output_type class-attribute instance-attribute ¶

reduction class-attribute instance-attribute ¶

Functions¶

__post_init__ ¶

from_dict classmethod ¶

compute ¶

aggregate ¶

MetricFn ¶

Functions¶

__call__ ¶

AggregationFn ¶

Functions¶

__call__ ¶

call ¶

call ¶

call ¶

call ¶

call ¶

call ¶

Metric `dataclass` ¶

name `instance-attribute` ¶

fn `instance-attribute` ¶

kwargs `class-attribute` `instance-attribute` ¶

aggregation `class-attribute` `instance-attribute` ¶

higher_is_better `class-attribute` `instance-attribute` ¶

output_type `class-attribute` `instance-attribute` ¶

reduction `class-attribute` `instance-attribute` ¶

from_dict `classmethod` ¶

call ¶

call ¶

call ¶

call `abstractmethod` ¶

aggregation `abstractmethod` ¶

LLResults `dataclass` ¶

results `instance-attribute` ¶

lls `class-attribute` `instance-attribute` ¶

is_greedy `class-attribute` `instance-attribute` ¶

targets `instance-attribute` ¶

ctx `class-attribute` `instance-attribute` ¶

choices `class-attribute` `instance-attribute` ¶

lls_mutual_info `class-attribute` `instance-attribute` ¶

metadata `class-attribute` `instance-attribute` ¶

from_instances `classmethod` ¶