Adds continuous batching (#850)
Adds the changes needed to call `generate` with continuous batching (CB).
Linked PR: https://github.com/huggingface/transformers/pull/38085
The following end-to-end example works:
```python
import torch
from transformers import AutoModelForCausalLM, GenerationConfig

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.models.transformers.transformers_model import TransformersModel
from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters

BENCHMARKS = "lighteval|gsm8k|0|0"

evaluation_tracker = EvaluationTracker(output_dir="./results")
pipeline_params = PipelineParameters(
    use_chat_template=True,
    launcher_type=ParallelismManager.NONE,
    max_samples=None,
)

# Load the model with the paged SDPA attention implementation required for
# continuous batching.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3b-Instruct",
    attn_implementation="sdpa_paged",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Configure generation parameters, including the paged cache layout
# (num_blocks blocks of block_size tokens each).
generation_config = GenerationConfig(
    max_new_tokens=10,
    eos_token_id=model.config.eos_token_id,
    pad_token_id=model.config.pad_token_id,
    num_blocks=2048,
    block_size=256,
)
model.generation_config = generation_config

# Wrap the transformers model so lighteval can drive it.
model = TransformersModel.from_model(model)

pipeline = Pipeline(
    model=model,
    pipeline_parameters=pipeline_params,
    evaluation_tracker=evaluation_tracker,
    tasks=BENCHMARKS,
)

pipeline.evaluate()
results = pipeline.get_results()["results"]
print(results)
```
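
If you also want to persist the scores and print the summary table, a minimal sketch using lighteval's standard `Pipeline` helpers (`save_and_push_results` and `show_results`, which are part of the existing Python API rather than this PR's changes) would look like:

```python
# Assumes the `pipeline` object from the example above has already been
# evaluated. These helpers come from lighteval's regular Python API.
pipeline.save_and_push_results()  # write results to the EvaluationTracker output dir
pipeline.show_results()           # print the per-task score table
```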
---------
Co-authored-by: Arthur Zucker <arthur.zucker@gmail.com>
Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>