Skip to content

Task Class

The Task class defines a single evaluation benchmark — how to load data, format prompts, and score responses.

Source

Task

Task(config: TaskConfig | dict[str, Any])

A task represents an entire benchmark, including its dataset, problems, answers, and evaluation methods.

See BoolQ for a simple example implementation.

A doc can be any Python object that represents one instance of evaluation. This is usually a dictionary, e.g. {"question": ..., "answer": ...}.

Source code in lm_eval/api/task/_task.py
def __init__(self, config: TaskConfig | dict[str, Any]):
    """Set up the task from a ``TaskConfig`` or a plain dict of its fields.

    Only configuration is resolved here; the dataset is not loaded and no
    instances are built.

    Args:
        config: A ready ``TaskConfig``, or a dict that is expanded into
            ``TaskConfig(**config)``.

    Raises:
        AssertionError: If the config has no task name, or if neither the
            config nor the class provides an output type. (These are plain
            ``assert`` statements.)
    """
    # Accept either form of config; dicts are promoted to TaskConfig.
    self._config: TaskConfig = (
        config if isinstance(config, TaskConfig) else TaskConfig(**config)
    )
    self.task = self._config.task
    # NOTE(review): assumes config.metadata is a dict at this point
    # (download() treats it as possibly unset) — confirm.
    self.VERSION = self.config.metadata.get("version", self.VERSION)
    assert self.task is not None
    # Config value wins; the class attribute is the fallback.
    self.OUTPUT_TYPE = self._config.output_type or self.OUTPUT_TYPE or None
    assert self.OUTPUT_TYPE, "output_type must be set in TaskConfig or subclass"
    # Same precedence for dataset location: config first, class attribute second.
    self._dataset_name = self._config.dataset_name or self.DATASET_NAME
    self._dataset_path = self._config.dataset_path or self.DATASET_PATH
    self._fewshot_cfg: FewshotConfig = cast(
        "FewshotConfig", self._config.fewshot_config
    )  # presumably already normalized to a FewshotConfig by TaskConfig — TODO confirm

    self._multiple_inputs = self._config.multiple_inputs
    self._multiple_targets = self.config.multiple_targets
    # Multimodal if the config maps docs to audio/image, or the class says so.
    self._multimodal = (
        bool(self.config.doc_to_audio or self.config.doc_to_image)
        or self.MULTIMODAL
    )

    # lazy load dataset
    self._dataset: Dataset | None = None
    # resolve sampler class, does not need dataset access
    self._sampler_cls: type[samplers.ContextSampler] = self._resolve_sampler_cls()
    # fewshot seed is None by default, so sampler will use random seed.
    self._fewshot_seed: int | None = None
    # Filled in by build_all_requests().
    self._instances = None

    self._scorers: list[Scorer] = self._build_scorers()

Attributes

VERSION class-attribute instance-attribute

VERSION: str = get('version', VERSION)

OUTPUT_TYPE class-attribute instance-attribute

OUTPUT_TYPE: OutputType | Literal['multiple_choice'] | None = output_type or OUTPUT_TYPE or None

DATASET_PATH class-attribute instance-attribute

DATASET_PATH: str | None = None

DATASET_NAME class-attribute instance-attribute

DATASET_NAME: str | None = None

MULTIMODAL class-attribute instance-attribute

MULTIMODAL: bool = False

task instance-attribute

task = task

sampler cached property

sampler: ContextSampler

Lazily create the fewshot sampler (triggers dataset download on first access).

dataset property writable

dataset: Dataset

Lazily load and return the dataset.

config property

config: TaskConfig

Returns the TaskConfig associated with this class.

eval_docs property

eval_docs: DataSplit

instances property

instances: list[Instance]

Dataset instances which will be evaluated.

Populated after calling task.build_all_requests().

is_multimodal cached property

is_multimodal

scorers property

scorers: list[Scorer]

Public access to the scorer pipeline.

task_name property

task_name: str

id cached property

id

Functions

from_config classmethod

from_config(config: TaskConfig | dict[str, Any])

Factory method to create the appropriate Task subclass based on output_type.

PARAMETER DESCRIPTION
config

TaskConfig instance or dict with task configuration

TYPE: TaskConfig | dict[str, Any]

RETURNS DESCRIPTION

Instance of the appropriate Task subclass (GenerateTask, MultipleChoiceTask, etc.)

Source code in lm_eval/api/task/_task.py
@classmethod
def from_config(cls, config: TaskConfig | dict[str, Any]):
    """Create the Task subclass registered for the config's ``output_type``.

    Args:
        config: A ``TaskConfig``, or a dict that is expanded into one.

    Returns:
        An instance of the class stored under ``config.output_type`` in
        ``cls._registry``, constructed with the normalized config.

    Raises:
        ValueError: If no class is registered for that output type.
    """
    # Dicts are promoted to TaskConfig; any other object is used as given.
    task_config = (
        TaskConfig(**config)  # type:ignore[invalid-argument-type]
        if isinstance(config, dict)
        else config
    )
    output_type = task_config.output_type

    # Happy path first: a registered type is instantiated straight away.
    if output_type in cls._registry:
        return cls._registry[output_type](task_config)

    raise ValueError(
        f"No Task class registered for output_type '{output_type}'. "
        f"Available types: {sorted(cls._registry.keys())}"
    )

count_bytes staticmethod

count_bytes(doc)

Used for byte-level perplexity metrics in rolling loglikelihood.

Source code in lm_eval/api/task/_task.py
@staticmethod
def count_bytes(doc):
    """Used for byte-level perplexity metrics in rolling loglikelihood."""
    return len(doc.encode("utf-8"))

count_words staticmethod

count_words(doc)

Downstream loglikelihood_rolling perplexity tasks with custom word boundaries should override this!

Source code in lm_eval/api/task/_task.py
@staticmethod
def count_words(doc):
    """Downstream loglikelihood_rolling perplexity tasks with custom word boundaries should override this!"""
    return len(re.split(r"\s+", doc))

download

download(dataset_kwargs: dict[str, Any] | None = None, **kwargs) -> None
Source code in lm_eval/api/task/_task.py
def download(self, dataset_kwargs: dict[str, Any] | None = None, **kwargs) -> None:
    """Load the task's dataset and store it on ``self._dataset``.

    A callable ``custom_dataset`` on the config takes precedence and is called
    with the config's ``dataset_kwargs`` merged with its ``metadata``.
    Otherwise the configured splits are loaded via ``load_dataset_splits``.

    Args:
        dataset_kwargs: Optional kwargs dict. ``trust_remote_code`` is removed
            from it in place when the installed ``datasets`` is 4.0.0 or newer.
        **kwargs: Accepted but not used by this implementation.

    Raises:
        AssertionError: If no custom loader is configured and no dataset path
            is set on the config or the class.
    """
    import datasets
    from packaging.version import parse as vparse

    # NOTE(review): only the caller-supplied dict is pruned; the load below
    # splats the config's dataset_kwargs — confirm they are the same object.
    if dataset_kwargs and vparse(datasets.__version__) >= vparse("4.0.0"):
        dataset_kwargs.pop("trust_remote_code", None)

    # Replace unset (falsy) config fields with empty dicts so they can be
    # merged and splatted below.
    cfg = self._config
    normalized_kwargs = cfg.dataset_kwargs or {}
    normalized_metadata = cfg.metadata or {}
    cfg.dataset_kwargs = normalized_kwargs
    cfg.metadata = normalized_metadata

    custom_loader = cfg.custom_dataset
    if callable(custom_loader):
        eval_logger.warning(
            "%s: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager."
            "\nFor example --metadata='{\"max_seq_lengths\":[4096, 8192]}'. For details see task Readme.",
            cfg.task,
        )
        # On key clashes the metadata value wins (right operand of `|`).
        self._dataset = custom_loader(**(cfg.dataset_kwargs | cfg.metadata))
        return

    assert self._dataset_path is not None, (
        "dataset_path must be set in TaskConfig or class attribute"
    )
    candidate_splits = (
        self.config.training_split,
        self.config.validation_split,
        self.config.test_split,
        self.config.fewshot_split,
    )
    # Only splits that are actually configured are requested.
    wanted_splits = []
    for split_name in candidate_splits:
        if split_name is not None:
            wanted_splits.append(split_name)

    self._dataset = load_dataset_splits(
        path=self._dataset_path,
        name=self._dataset_name,
        split=wanted_splits,
        **self.config.dataset_kwargs,
    )

has_training_docs

has_training_docs() -> bool
Source code in lm_eval/api/task/_task.py
def has_training_docs(self) -> bool:
    """Return True when the config names a training split."""
    split_name = self.config.training_split
    return split_name is not None

has_validation_docs

has_validation_docs() -> bool
Source code in lm_eval/api/task/_task.py
def has_validation_docs(self) -> bool:
    """Return True when the config names a validation split."""
    split_name = self.config.validation_split
    return split_name is not None

has_test_docs

has_test_docs() -> bool
Source code in lm_eval/api/task/_task.py
def has_test_docs(self) -> bool:
    """Return True when the config names a test split."""
    split_name = self.config.test_split
    return split_name is not None

training_docs

training_docs() -> DataSplit | None
Source code in lm_eval/api/task/_task.py
def training_docs(self) -> DataSplit | None:
    """Return whatever ``_get_split_docs`` yields for the configured training split."""
    split_name = self.config.training_split
    return self._get_split_docs(split_name)

validation_docs

validation_docs() -> DataSplit | None
Source code in lm_eval/api/task/_task.py
def validation_docs(self) -> DataSplit | None:
    """Return whatever ``_get_split_docs`` yields for the configured validation split."""
    split_name = self.config.validation_split
    return self._get_split_docs(split_name)

test_docs

test_docs() -> DataSplit | None
Source code in lm_eval/api/task/_task.py
def test_docs(self) -> DataSplit | None:
    return self._get_split_docs(self.config.test_split)

fewshot_docs

fewshot_docs() -> DataSplit | None
Source code in lm_eval/api/task/_task.py
def fewshot_docs(self) -> DataSplit | None:
    """Return the documents few-shot examples are drawn from.

    Resolution order:
    1. Docs supplied by the fewshot config for the loaded dataset.
    2. If ``num_fewshot`` is positive: the training split, else the
       validation split, else (with a warning) the test split, else ``[]``.
    3. Otherwise ``None``.

    Whenever docs are resolved, a list copy is cached on ``self._fewshot_docs``.
    """
    configured = self._fewshot_cfg.get_docs(self.dataset)
    if configured is not None:
        self._fewshot_docs = list(configured)
        # NOTE(review): this branch hands back the original object while the
        # branches below return the cached list — confirm that is intended.
        return configured

    shots = self._config.num_fewshot
    if shots is None or not shots > 0:
        # No few-shot examples requested and none configured.
        return None

    eval_logger.warning(
        "[Task: %s] num_fewshot > 0 but fewshot_split is None. "
        "using preconfigured rule.",
        self._config.task,
    )
    # Prefer training docs; validation docs are only consulted when the
    # training docs are falsy.
    preferred = self.training_docs() or self.validation_docs()
    if preferred is not None:
        self._fewshot_docs = list(preferred)
        return self._fewshot_docs

    # Last resort: the test split.
    eval_logger.warning(
        "[Task: %s] has_training_docs and has_validation_docs are False"
        ", using test_docs as fewshot_docs but this is not recommended.",
        self._config.task,
    )
    fallback = self.test_docs()
    if fallback is not None:
        self._fewshot_docs = list(fallback)
        return self._fewshot_docs

    self._fewshot_docs = []
    return self._fewshot_docs

get_docs

get_docs(subset: str) -> DataSplit | None
Source code in lm_eval/api/task/_task.py
def get_docs(self, subset: str) -> DataSplit | None:
    """Return the docs for the split named by the config attribute ``subset``.

    ``subset`` is the name of a config attribute (looked up with ``getattr``);
    a missing attribute is treated as ``None`` and passed on as such.
    """
    split_name = getattr(self.config, subset, None)
    return self._get_split_docs(split_name)

doc_iterator

doc_iterator(*, rank: int = 0, limit: int | None = None, world_size: int = 1, samples: Sequence[int] | None = None) -> Iterator[tuple[int, Any]]
Source code in lm_eval/api/task/_task.py
def doc_iterator(
    self,
    *,
    rank: int = 0,
    limit: int | None = None,
    world_size: int = 1,
    samples: Sequence[int] | None = None,
) -> Iterator[tuple[int, Any]]:
    if samples:
        n = len(self.eval_docs)
        assert all(e < n for e in samples), (
            f"Elements of --samples should be in the interval [0,k-1] where k is the number of total examples. In this case, k={n}."
        )
        eval_logger.info(
            "%s: Evaluating on %s examples",
            self.config.task,
            len(samples),
        )
        sample_set = set(samples)
        return utils.create_iterator(
            ((i, x) for i, x in enumerate(self.eval_docs) if i in sample_set),
            rank=int(rank),
            limit=None,
            world_size=int(world_size),
        )

    limit = int(limit) if limit else None
    return utils.create_iterator(
        enumerate(self.eval_docs),
        rank=int(rank),
        limit=limit,
        world_size=int(world_size),
    )

fewshot_context

fewshot_context(doc: dict, num_fewshot: int, system_instruction: str | None = None, apply_chat_template: bool = False, fewshot_as_multiturn: bool = False, chat_template: ChatTemplate | None = None, gen_prefix: str | None = None) -> str | list[str]

Build the full prompt context including system prompt, few-shot examples, and eval doc.

Constructs a complete prompt by: 1. Adding system instruction + task description (if provided) 2. Adding num_fewshot labeled examples from the fewshot split 3. Adding the evaluation document (without its answer)

Each component is built using build_qa_turn() and can be rendered as plain text or formatted via a chat template.

PARAMETER DESCRIPTION
doc

The evaluation document to build context for.

TYPE: dict

num_fewshot

Number of few-shot examples to include.

TYPE: int

system_instruction

System instruction to prepend to the prompt.

TYPE: str | None DEFAULT: None

apply_chat_template

If True, format output using the chat template.

TYPE: bool DEFAULT: False

fewshot_as_multiturn

If True, keep few-shot examples as separate user/assistant turns. If False, collapse into a single user message.

TYPE: bool DEFAULT: False

chat_template

Renders a list of message dicts to a string.

TYPE: ChatTemplate | None DEFAULT: None

gen_prefix

Prefix to start the assistant's response (e.g., "Answer:").

TYPE: str | None DEFAULT: None

RETURNS DESCRIPTION
str | list[str]

str | list[str]: The formatted prompt string, or a list of strings for multiple-input tasks (e.g., Winogrande where each choice becomes a separate context).

Source code in lm_eval/api/task/_task.py
def fewshot_context(
    self,
    doc: dict,
    num_fewshot: int,
    system_instruction: str | None = None,
    apply_chat_template: bool = False,
    fewshot_as_multiturn: bool = False,
    chat_template: ChatTemplate | None = None,
    gen_prefix: str | None = None,
) -> str | list[str]:
    """Build the full prompt context including system prompt, few-shot examples, and eval doc.

    Constructs a complete prompt by:
    1. Adding system instruction + task description (if provided)
    2. Adding `num_fewshot` labeled examples from the fewshot split
    3. Adding the evaluation document (without its answer)

    Each component is built using `_build_qa_turn()` and can be rendered as plain
    text or formatted via a chat template.

    Args:
        doc (dict): The evaluation document to build context for.
        num_fewshot (int): Number of few-shot examples to include.
        system_instruction (str | None): System instruction to prepend to the prompt.
        apply_chat_template (bool): If True, format output using the chat template.
            Has no effect unless ``chat_template`` is also given.
        fewshot_as_multiturn (bool): If True, keep few-shot examples as separate
            user/assistant turns. If False, collapse into a single user message.
        chat_template (ChatTemplate | None): Renders a list of message dicts to a string.
        gen_prefix (str | None): Prefix to start the assistant's response (e.g., "Answer:").

    Returns:
        str | list[str]: The formatted prompt string, or a list of strings for
            multiple-input tasks (e.g., Winogrande where each choice becomes a
            separate context).

    Raises:
        AssertionError: If a multiple-input few-shot example has a non-integer
            target, if a multiple-input task's ``doc_to_text`` is not a list,
            or if a regular task's ``doc_to_text`` is not a string.
    """
    messages: list[Message] = []
    # Pre-bind add_generation_prompt: it is switched off whenever a
    # gen_prefix is supplied.
    chat_template = (
        partial(chat_template, add_generation_prompt=not gen_prefix)
        if chat_template
        else None
    )
    description = self._resolve_field(doc, self.config.description) or ""
    system_prompt = maybe_delimit(
        system_instruction, description, self.config.fewshot_delimiter
    )
    if system_prompt:
        messages.append(Message("system", system_prompt))

    if num_fewshot > 0:
        # eval_doc is only passed when the few-shot split equals the test
        # split — presumably so the sampler can avoid returning the doc being
        # evaluated; TODO confirm against the sampler.
        for fs_doc in self.sampler.sample(
            n=num_fewshot,
            eval_doc=doc
            if self._fewshot_cfg.split == self.config.test_split
            else None,
        ):
            # Few-shot examples use the field mappings of the fewshot config.
            q, c, a = (
                self.doc_to_text(fs_doc, self._fewshot_cfg.doc_to_text),
                self.doc_to_choice(fs_doc, self._fewshot_cfg.doc_to_choice)
                if self._fewshot_cfg.doc_to_choice
                else None,
                self.doc_to_target(fs_doc, self._fewshot_cfg.doc_to_target),
            )
            # in most cases we expect q to be a string, except for multiple-input:
            # q: list[str], c: list[str] len 1, a: int index into q
            if isinstance(q, list):
                assert isinstance(a, int), (
                    "Multiple-input fewshot examples require integer answer keys to index into the question list"
                )
                q = q[a]
                a = 0  # choices are a list of len 1.
            # NOTE(review): the few-shot gen_prefix is resolved against the
            # eval `doc`, not `fs_doc` — confirm this is intended.
            _gen_prefix = self._resolve_field(doc, self._fewshot_cfg.gen_prefix)
            messages += self._build_qa_turn(
                q=q,
                c=c,
                a=a,
                gen_prefix=_gen_prefix,
                tgt_delim=self._fewshot_cfg.target_delimiter,
                few_delim=self._fewshot_cfg.fewshot_delimiter,
            )

    # The eval doc itself uses the task-level field mappings. `a` is computed
    # here but not passed on below, so the answer is left out of the prompt.
    q, c, a = (
        self.doc_to_text(doc),
        self.doc_to_choice(doc) if self.config.doc_to_choice else None,
        self.doc_to_target(doc),
    )
    if self._multiple_inputs:
        assert isinstance(q, list), "multiple inputs require choices to be a list"
        # Rendering is delegated to _multiple_input_context for these tasks.
        return self._multiple_input_context(
            messages,
            gen_prefix,
            q,
            chat_template=chat_template if apply_chat_template else None,
            fewshot_as_multiturn=fewshot_as_multiturn,
        )
    assert isinstance(q, str), (
        f"Expected doc_to_text to be a string, got {type(q)}: {q}"
    )
    messages += self._build_qa_turn(
        q=q,
        c=c,
        gen_prefix=gen_prefix,
        # fewshot delimiter used to separate q and gen_prefix
        tgt_delim=self.config.target_delimiter,
        few_delim="",
    )
    if apply_chat_template and chat_template:
        # Multiturn keeps one dict per message; otherwise the turns are
        # collapsed by multiturn_to_singleturn before templating.
        res = (
            [m.to_dict() for m in messages]
            if fewshot_as_multiturn
            else multiturn_to_singleturn(messages)
        )
        res: list[dict[str, str]] | str = chat_template(res)
    else:
        # Plain-text rendering: concatenate every message's text.
        res: str = "".join(m.to_text() for m in messages)

    return res

construct_requests abstractmethod

construct_requests(doc: dict[str, Any], ctx: str | Sequence[str] | list[dict[str, Any]], *, doc_id: int, metadata: dict[str, Any] | None = None, apply_chat_template: bool = False, chat_template: ChatTemplate | None = None, **kwargs) -> list[GenInstance] | list[LLInstance] | None

Convert a doc and its prompt context into Instance objects for the LM.

Called by build_all_requests after fewshot_context has produced the prompt. Each subclass maps the prompt into the request format its output type requires (loglikelihood pairs, generation args, etc.).

PARAMETER DESCRIPTION
doc

The evaluation document from the dataset split.

TYPE: dict[str, Any]

ctx

The prompt produced by fewshot_context. Shape depends on rendering mode: - str: plain-text prompt - list[str]: one prompt per input (multiple-input tasks) - list[dict]: chat-formatted message list

TYPE: str | Sequence[str] | list[dict[str, Any]]

doc_id

Index of the document within the evaluation split.

TYPE: int

metadata

Per-instance metadata forwarded to the Instance.

TYPE: dict[str, Any] | None DEFAULT: None

apply_chat_template

Whether a chat template was applied.

TYPE: bool DEFAULT: False

chat_template

The chat template callable, if any.

TYPE: ChatTemplate | None DEFAULT: None

RETURNS DESCRIPTION
list[GenInstance] | list[LLInstance] | None

A list of Instances to send to the LM, or None to skip this doc.

Source code in lm_eval/api/task/_task.py
@abc.abstractmethod
def construct_requests(
    self,
    doc: dict[str, Any],
    ctx: str | Sequence[str] | list[dict[str, Any]],
    *,
    doc_id: int,
    metadata: dict[str, Any] | None = None,
    apply_chat_template: bool = False,
    chat_template: ChatTemplate | None = None,
    **kwargs,
) -> list[GenInstance] | list[LLInstance] | None:
    """Convert a doc and its prompt context into Instance objects for the LM.

    Called by ``build_all_requests`` after ``fewshot_context`` has produced
    the prompt. Each subclass maps the prompt into the request format its
    output type requires (loglikelihood pairs, generation args, etc.).

    This is an abstract declaration: the body is empty and subclasses must
    provide the implementation.

    Args:
        doc: The evaluation document from the dataset split.
        ctx: The prompt produced by ``fewshot_context``. Shape depends on
            rendering mode:
            - str: plain-text prompt
            - list[str]: one prompt per input (multiple-input tasks)
            - list[dict]: chat-formatted message list
        doc_id: Index of the document within the evaluation split.
        metadata: Per-instance metadata forwarded to the Instance.
        apply_chat_template: Whether a chat template was applied.
        chat_template: The chat template callable, if any.
        **kwargs: Extra keyword arguments; their meaning is left to the
            implementing subclass.

    Returns:
        A list of Instances to send to the LM, or None to skip this doc.
    """
    ...

build_all_requests

build_all_requests(*, limit: int | None = None, samples: Sequence[int] | None = None, rank: int = 0, world_size: int = 1, cache_requests: bool = False, rewrite_requests_cache: bool = False, system_instruction: str | None = None, apply_chat_template: bool = False, fewshot_as_multiturn: bool = False, chat_template: ChatTemplate | None = None, tokenizer_name: str = '') -> list[Instance]

Build all Instance objects for this task and store them in self._instances.

For each document in the evaluation split this method: 1. Builds the prompt via fewshot_context. 2. Converts it to Instance(s) via construct_requests. 3. Optionally loads/saves results from a request cache.

PARAMETER DESCRIPTION
limit

Maximum number of documents to evaluate (None = all).

TYPE: int | None DEFAULT: None

samples

Explicit list of document indices to evaluate.

TYPE: Sequence[int] | None DEFAULT: None

rank

Worker rank for distributed evaluation.

TYPE: int DEFAULT: 0

world_size

Total number of workers.

TYPE: int DEFAULT: 1

cache_requests

Whether to load/save instances from cache.

TYPE: bool DEFAULT: False

rewrite_requests_cache

Force-rebuild the cache even if it exists.

TYPE: bool DEFAULT: False

system_instruction

System prompt prepended to every context.

TYPE: str | None DEFAULT: None

apply_chat_template

Whether to render prompts through a chat template.

TYPE: bool DEFAULT: False

fewshot_as_multiturn

Keep few-shot examples as separate chat turns instead of collapsing them into a single user message.

TYPE: bool DEFAULT: False

chat_template

The chat template callable.

TYPE: ChatTemplate | None DEFAULT: None

tokenizer_name

Included in the cache key to distinguish tokenizers.

TYPE: str DEFAULT: ''

RETURNS DESCRIPTION
list[Instance]

Flat list of Instances, also stored in self._instances.

Source code in lm_eval/api/task/_task.py
def build_all_requests(
    self,
    *,
    limit: int | None = None,
    samples: Sequence[int] | None = None,
    rank: int = 0,
    world_size: int = 1,
    cache_requests: bool = False,
    rewrite_requests_cache: bool = False,
    system_instruction: str | None = None,
    apply_chat_template: bool = False,
    fewshot_as_multiturn: bool = False,
    chat_template: ChatTemplate | None = None,
    tokenizer_name: str = "",
) -> list[Instance]:
    """Build all Instance objects for this task and store them in ``self._instances``.

    For each document in the evaluation split this method:
    1. Builds the prompt via ``fewshot_context``.
    2. Converts it to Instance(s) via ``construct_requests``.
    3. Optionally loads/saves the built instances from/to a request cache.

    Args:
        limit: Maximum number of documents to evaluate (None = all).
        samples: Explicit list of document indices to evaluate.
        rank: Worker rank for distributed evaluation.
        world_size: Total number of workers.
        cache_requests: Whether to load/save instances from cache.
        rewrite_requests_cache: Force-rebuild the cache even if it exists.
        system_instruction: System prompt prepended to every context.
        apply_chat_template: Whether to render prompts through a chat template.
        fewshot_as_multiturn: Keep few-shot examples as separate chat turns
            instead of collapsing them into a single user message.
        chat_template: The chat template callable.
        tokenizer_name: Included in the cache key to distinguish tokenizers.

    Returns:
        Flat list of Instances, also stored in ``self._instances``.

    Raises:
        ValueError: If no instances were produced at all.
    """
    # NOTE(review): the key covers task, num_fewshot, rank, world_size, chat
    # settings, system instruction and tokenizer — but not `samples` or
    # `limit`; confirm a cache written with `samples` set cannot be reused
    # for a different selection.
    cache_key = _build_cache_key(
        self._config.task,
        self.config.num_fewshot,
        rank,
        world_size,
        apply_chat_template,
        fewshot_as_multiturn,
        system_instruction,
        tokenizer_name,
    )
    # Always called; `cache=cache_requests` is handed to the loader.
    cached = load_from_cache(file_name=cache_key, cache=cache_requests)

    if cache_requests and cached and not rewrite_requests_cache:
        # Cache hit: entries are per-doc groups, so slicing limits by doc.
        grouped = cached[:limit]
    else:
        # When caching a miss/rewrite, build ALL docs so the cache is
        # complete; then slice to the requested limit afterwards.
        should_build_all = (
            cache_requests
            and (not cached or rewrite_requests_cache)
            and limit is not None
        )
        build_limit = None if should_build_all else limit

        eval_logger.debug(
            "Building contexts for %s on rank %s...",
            self.config.task,
            rank,
        )

        # One inner list of Instances per document.
        grouped: list[list[Instance]] = []
        # Materialized up front so tqdm can be given a total.
        doc_id_docs = list(
            self.doc_iterator(
                rank=rank, limit=build_limit, samples=samples, world_size=world_size
            )
        )

        for doc_id, doc in tqdm(doc_id_docs, total=len(doc_id_docs), delay=5):
            # An unset or negative num_fewshot is treated as 0 shots.
            fewshot_ctx = self.fewshot_context(
                doc,
                num_fewshot=0
                if self.config.num_fewshot is None
                else max(0, self.config.num_fewshot),
                system_instruction=system_instruction,
                apply_chat_template=apply_chat_template,
                fewshot_as_multiturn=fewshot_as_multiturn,
                chat_template=chat_template,
                gen_prefix=self._resolve_field(doc, self.config.gen_prefix),
            )

            # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
            inst = self.construct_requests(
                doc=doc,
                ctx=fewshot_ctx,
                doc_id=doc_id,
                apply_chat_template=apply_chat_template,
                chat_template=chat_template,
            )
            # construct_requests returns None to skip a doc.
            if inst is None:
                eval_logger.info("Skipping doc_id=%s.", doc_id)
                continue
            # Tolerate a single Instance being returned instead of a list.
            if not isinstance(inst, list):
                inst = [inst]

            grouped.append(inst)

        # Persist the (unsliced) groups before applying the limit.
        if cache_requests and (not cached or rewrite_requests_cache):
            save_to_cache(file_name=cache_key, obj=grouped)

        grouped = grouped[:limit]

    flattened = [inst for group in grouped for inst in group]

    if not flattened:
        # NOTE(review): the message names `build_requests()` although this
        # method is `build_all_requests()` — left unchanged here.
        raise ValueError("task.build_requests() did not find any docs!")

    self._instances = flattened
    return flattened

doc_to_text

doc_to_text(doc: Doc, doc_to_text: Callable[[Doc], str | list[str]] | str | None = None) -> str | list[str] | None
Source code in lm_eval/api/task/_task.py
def doc_to_text(
    self,
    doc: Doc,
    doc_to_text: Callable[[Doc], str | list[str]] | str | None = None,
) -> str | list[str] | None:
    """Render the prompt text for ``doc``.

    Args:
        doc: The document to render.
        doc_to_text: Optional per-call field spec; when ``None`` the
            config's ``doc_to_text`` is used instead.

    Returns:
        Whatever ``process_field`` produces for the chosen field spec.
    """
    if doc_to_text is None:
        field_spec = self.config.doc_to_text
    else:
        field_spec = doc_to_text
    return process_field(doc, field_spec)

doc_to_choice

doc_to_choice(doc: Doc, doc_to_choice: Callable[[Doc], list[str]] | str | list[str] | None = None) -> list[str] | None
Source code in lm_eval/api/task/_task.py
def doc_to_choice(
    self,
    doc: Doc,
    doc_to_choice: Callable[[Doc], list[str]] | str | list[str] | None = None,
) -> list[str] | None:
    """Return the answer choices for ``doc``.

    Args:
        doc: The document to read choices from.
        doc_to_choice: Optional per-call field spec; when ``None`` the
            config's ``doc_to_choice`` is used instead.

    Returns:
        The coerced result if it is a list or ``None``; ``None`` (after a
        warning) if coercion yielded anything else.
    """
    if doc_to_choice is None:
        field_spec = self.config.doc_to_choice
    else:
        field_spec = doc_to_choice

    choices = _coerce_list(process_field(doc, field_spec))
    # Lists and None are passed through untouched.
    if choices is None or isinstance(choices, list):
        return choices

    eval_logger.warning(
        "doc_to_choice must return a list, got %s: %r. Skipping ...",
        type(choices).__name__,
        choices,
    )
    return None

doc_to_target

doc_to_target(doc: Doc, doc_to_target: Callable[[Doc], str | int | list[int] | list[str]] | str | None = None) -> str | int | list[str] | list[int] | None
Source code in lm_eval/api/task/_task.py
def doc_to_target(
    self,
    doc: Doc,
    doc_to_target: Callable[[Doc], str | int | list[int] | list[str]]
    | str
    | None = None,
) -> str | int | list[str] | list[int] | None:
    """Return the gold target for ``doc``.

    Args:
        doc: The document to read the target from.
        doc_to_target: Optional per-call field spec; when ``None`` the
            config's ``doc_to_target`` is used instead.

    Returns:
        The processed field after ``_coerce_target``; list parsing is
        requested only when ``self._multiple_targets`` is exactly ``True``.
    """
    if doc_to_target is None:
        field_spec = self.config.doc_to_target
    else:
        field_spec = doc_to_target

    raw_target = process_field(doc, field_spec)
    wants_list = self._multiple_targets is True
    return _coerce_target(raw_target, parse_list=wants_list)

doc_to_image

doc_to_image(doc: Any, doc_to_image=None) -> int | str | list | None
Source code in lm_eval/api/task/_task.py
def doc_to_image(self, doc: Any, doc_to_image=None) -> int | str | list | None:
    """Resolve the image field for ``doc``.

    A truthy ``doc_to_image`` argument takes precedence; any falsy value
    falls back to the config's ``doc_to_image``.
    """
    field_spec = doc_to_image if doc_to_image else self.config.doc_to_image
    return process_field(doc, field_spec)

doc_to_audio

doc_to_audio(doc: Any, doc_to_audio=None) -> int | str | list | None
Source code in lm_eval/api/task/_task.py
def doc_to_audio(self, doc: Any, doc_to_audio=None) -> int | str | list | None:
    """Resolve the audio field for ``doc``.

    A truthy ``doc_to_audio`` argument takes precedence; any falsy value
    falls back to the config's ``doc_to_audio``.
    """
    field_spec = doc_to_audio if doc_to_audio else self.config.doc_to_audio
    return process_field(doc, field_spec)

apply_filters

apply_filters() -> None

Apply filter ensembles from each scorer to instances.

Source code in lm_eval/api/task/_task.py
def apply_filters(self) -> None:
    """Run every scorer's filter over the built instances.

    Does nothing when there are no instances (``None`` or empty).
    """
    if self._instances:
        for scorer in self._scorers:
            scorer.apply_filter(self._instances)

process_instances

process_instances() -> None

Apply filters, score instances, reduce — all stored on Scorers.

For each scorer, tries the legacy process_results path first (YAML !function or Python subclass override). Falls through to scorer.score_instances() only when process_results returns None.

Source code in lm_eval/api/task/_task.py
def process_instances(self) -> None:
    """Filter and score the built instances, storing results on each scorer.

    For every scorer the legacy ``process_results`` path is tried first via
    ``_try_process_results``; only when that returns ``None`` does the
    scorer's own ``score_instances`` run. Either way the outcome is handed
    to ``scorer.set_results``. Does nothing when there are no scorers.
    """
    if not self._scorers:
        return

    self.apply_filters()
    docs_by_id = group_by_doc_id(self._instances)

    for scorer in self._scorers:
        legacy_results = self._try_process_results(docs_by_id, scorer.name)
        if legacy_results is None:
            # No legacy handler produced output: use the scorer pipeline.
            legacy_results = scorer.score_instances(docs_by_id)
        scorer.set_results(legacy_results)

process_results

process_results(doc: dict[str, Any], results: Sequence[LLOutput] | Sequence[Completion]) -> dict[str, list[Any]] | None
Source code in lm_eval/api/task/_task.py
def process_results(
    self,
    doc: dict[str, Any],
    results: Sequence[LLOutput] | Sequence[Completion],
) -> dict[str, list[Any]] | None:
    """Delegate to the config's ``process_results`` hook when it is callable.

    Args:
        doc: The evaluation document.
        results: The model outputs for that document.

    Returns:
        The hook's return value, or ``None`` when no callable hook is
        configured.
    """
    if not callable(self.config.process_results):
        return None
    return self.config.process_results(doc, results)

aggregate

aggregate(bootstrap_iters: int | None = 100000) -> tuple[dict[str, Any], int]

Aggregate all scorers' reduced results.

Returns (agg_dict, sample_len) where agg_dict has "metric,scorer" string keys. This is the only place where string keys are produced.

Legacy Python tasks that override aggregation() get their custom functions forwarded to each scorer so that corpus-level metrics (e.g. SQuAD v2, SCROLLS) are aggregated correctly instead of falling back to mean.

Source code in lm_eval/api/task/_task.py
def aggregate(
    self, bootstrap_iters: int | None = 100000
) -> tuple[dict[str, Any], int]:
    """Aggregate every scorer's reduced results into one dict.

    Returns (agg_dict, sample_len) where agg_dict has "metric,scorer" string keys.
    This is the only place where string keys are produced.

    Legacy Python tasks that override ``aggregation()`` get their custom
    functions forwarded to each scorer so that corpus-level metrics
    (e.g. SQuAD v2, SCROLLS) are aggregated correctly instead of
    falling back to ``mean``.

    Args:
        bootstrap_iters: Passed through to each scorer's ``aggregate``.

    Returns:
        The merged per-scorer results and the largest sample count
        reported by any scorer (0 when there are no scorers).
    """
    # Overrides are only collected when a subclass replaced aggregation().
    if type(self).aggregation is Task.aggregation:
        overrides = None
    else:
        overrides = self.aggregation()

    merged: dict[str, Any] = {}
    largest_count = 0
    for scorer in self._scorers:
        scorer_result, scorer_count = scorer.aggregate(
            scorer.reduced_docs,
            bootstrap_iters=bootstrap_iters,
            aggregation_overrides=overrides,
        )
        # Later scorers overwrite earlier ones on identical keys.
        merged.update(scorer_result)
        largest_count = max(largest_count, scorer_count)
    return merged, largest_count

aggregation

aggregation() -> dict[str, Callable[[list[Any]], Any]]
Source code in lm_eval/api/task/_task.py
def aggregation(self) -> dict[str, Callable[[list[Any]], Any]]:
    """Map metric names to their aggregation functions across all scorers.

    Metrics without a (truthy) ``aggregation`` are left out, and scorers
    whose ``metrics`` is falsy contribute nothing. On duplicate metric
    names the last scorer wins.
    """
    by_name: dict[str, Callable[[list[Any]], Any]] = {}
    for scorer in self._scorers:
        for metric in scorer.metrics or []:
            if metric.aggregation:
                by_name[metric.name] = metric.aggregation
    return by_name

higher_is_better

higher_is_better() -> dict[str, bool]
Source code in lm_eval/api/task/_task.py
def higher_is_better(self) -> dict[str, bool]:
    """Merge the ``higher_is_better`` mapping of every scorer into one dict.

    Scorers are visited in order, so when two scorers report the same key
    the value from the later scorer is kept.
    """
    merged: dict[str, bool] = {}
    for scorer in self._scorers:
        merged.update(scorer.higher_is_better.items())
    return merged

get_config

get_config(key: str) -> Any
Source code in lm_eval/api/task/_task.py
def get_config(self, key: str) -> Any:
    """Return the attribute *key* of the task config.

    Returns ``None`` when the config has no attribute with that name.
    """
    try:
        return getattr(self._config, key)
    except AttributeError:
        return None

set_config

set_config(key: str, value: Any, update: bool = False) -> None

Set or update the configuration for a given key.

Source code in lm_eval/api/task/_task.py
def set_config(self, key: str, value: Any, update: bool = False) -> None:
    """Set or update the configuration for a given key.

    Args:
        key: Name of the attribute on ``self._config``.
        value: The new value. With ``update=True`` it must be something
            ``dict.update`` accepts (a mapping or an iterable of pairs).
        update: When false (default) the attribute is replaced by *value*.
            When true, *value* is merged into the dict already stored under
            *key*; if the config has no such attribute yet, a new dict built
            from *value* is stored.

    Raises:
        ValueError: If *key* is ``None``.
        TypeError: If ``update`` is true and the existing value is not a
            dict.
    """
    if key is None:
        raise ValueError("Key must be provided.")

    if not update:
        setattr(self._config, key, value)
        return

    missing = object()
    current_value = getattr(self._config, key, missing)
    if current_value is missing:
        # Nothing to merge into. Store a fresh dict on the config; merging
        # into a temporary default would silently discard the update.
        setattr(self._config, key, dict(value))
        return
    if not isinstance(current_value, dict):
        raise TypeError(
            f"Expected a dict for key '{key}', got {type(current_value).__name__} instead."
        )
    current_value.update(value)

override_metric

override_metric(metric_name: str) -> None

Override the default metrics with a single named metric.

Rebuilds the scorer pipeline so that only metric_name is computed. Used by the evaluator for predict_only mode (metric="bypass").

Source code in lm_eval/api/task/_task.py
def override_metric(self, metric_name: str) -> None:
    """Replace the task's scorers with one scorer for a single metric.

    The scorer pipeline is rebuilt so that only *metric_name* is computed:
    ``self._scorers`` becomes a one-element list holding a scorer built
    with an empty filter list and the task's ``OUTPUT_TYPE``.
    Used by the evaluator for ``predict_only`` mode (metric="bypass").
    """
    scorer_cfg = {
        "name": "none",
        "filter": [],
        "metric_list": [{"metric": metric_name}],
    }
    replacement = build_scorer(cfg=scorer_cfg, output_type=self.OUTPUT_TYPE)
    self._scorers = [replacement]

set_repeats

set_repeats(repeats: int) -> None

Override the default number of repeats for this task.

Source code in lm_eval/api/task/_task.py
def set_repeats(self, repeats: int) -> None:
    """Override the default number of repeats for this task.

    Logs the old and new values at debug level, then stores
    ``int(repeats)`` on the task config.
    """
    log_args = (self.task_name, self.config.repeats, repeats)
    eval_logger.debug(
        "[%s] Overwriting default repeats from %s to %s", *log_args
    )
    self._config.repeats = int(repeats)

set_num_fewshot

set_num_fewshot(num_fewshot: int) -> None

Override the default number of fewshot examples for this task.

Source code in lm_eval/api/task/_task.py
def set_num_fewshot(self, num_fewshot: int) -> None:
    """Override the default number of fewshot examples for this task.

    A config that explicitly sets ``num_fewshot`` to 0 is left untouched:
    the request is logged at info level and ignored. Otherwise the change
    is logged at debug level and ``int(num_fewshot)`` is stored on the
    config.
    """
    configured = self.config.num_fewshot
    if configured == 0:
        # An explicit zero in the config takes precedence over the caller.
        eval_logger.info(
            "[%s] num_fewshot has been set to 0 in its config. Manual configuration will be ignored.",
            self.task_name,
        )
    else:
        eval_logger.debug(
            "[%s] Overwriting default num_fewshot from %s to %s",
            self.task_name,
            configured or 0,
            num_fewshot,
        )
        self._config.num_fewshot = int(num_fewshot)

set_fewshot_seed

set_fewshot_seed(seed: int | None = None) -> None
Source code in lm_eval/api/task/_task.py
def set_fewshot_seed(self, seed: int | None = None) -> None:
    self.fewshot_rnd = random.Random(seed)
    self._fewshot_seed = seed
    # If sampler already materialized, update it directly
    if "sampler" in self.__dict__:
        self.sampler.set_rnd(seed)

dump_config

dump_config() -> dict

Returns the config as a dictionary.

Source code in lm_eval/api/task/_task.py
def dump_config(self) -> dict:
    """Return the task config as a dictionary, via the config's ``to_dict()``."""
    task_config = self.config
    return task_config.to_dict()

process_doc staticmethod

process_doc(doc: dict, fn: Callable) -> dict

Process (detokenize, strip, replace, etc.) an individual document.

Override this to transform documents. Can be used in a map over a data split, e.g. map(self._process_doc, self.dataset["validation"]).

RETURNS DESCRIPTION
dict

The processed version of the specified doc.

Source code in lm_eval/api/task/_task.py
@staticmethod
def process_doc(doc: dict, fn: Callable) -> dict:
    """Process (detokenize, strip, replace, etc.) an individual document.

    Override this to transform documents. Can be used in a map over a data split,
    e.g. ``map(self._process_doc, self.dataset["validation"])``.

    Returns:
        The processed version of the specified `doc`.
    """
    return doc