
Commit 9262fc0

refactor(LlamaBatch): replace set_batch with granular add_token + vectorized add_sequence
- Introduce high-performance add_token() for single-token append in generation loop
- Add flexible add_sequence() with per-token pos/seq_ids/logits arrays
- Remove old set_batch() that assumed single-seq + forced last logit
- Better support for multi-sequence and precise logit control
1 parent 781790f commit 9262fc0

3 files changed: 82 additions & 25 deletions
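
At a glance, call sites migrate as follows (a hedged sketch, not part of the diff; `batch` is an initialized LlamaBatch and `tokens` a prompt's token IDs):

    # Before: single sequence assumed, last logit forced on
    # batch.set_batch(tokens, n_past=0, logits_all=False)

    # After: positions, sequence membership, and logit flags are explicit
    batch.add_sequence(
        token_array=tokens,
        pos_array=list(range(len(tokens))),
        seq_ids=[0],
        logits_array=[False] * (len(tokens) - 1) + [True],  # logits for the last token only
    )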


llama_cpp/_internals.py (68 additions & 23 deletions)
@@ -675,37 +675,82 @@ def reset(self):
         if self.batch is not None:
             self.batch.n_tokens = 0
 
-    def set_batch(self, batch: Sequence[int], n_past: llama_cpp.llama_pos, logits_all: bool):
-        if len(batch) > self.n_tokens_capacity:
-            raise IndexError(f"Input batch size {len(batch)} exceeds capacity {self.n_tokens_capacity}")
+    def add_token(self, token: int, pos: int, seq_ids: Sequence[int], logits: bool):
+        """
+        Adds a single token to the batch.
+        This is a high-performance method for appending a single token during the
+        generation loop, avoiding the overhead of creating the temporary lists that
+        add_sequence requires.
 
-        n_tokens = len(batch)
-        self.batch.n_tokens = n_tokens
-        for i in range(n_tokens):
-            self.batch.token[i] = batch[i]
-            self.batch.pos[i] = n_past + i
-            self.batch.seq_id[i][0] = 0
-            self.batch.n_seq_id[i] = 1
-            self.batch.logits[i] = logits_all
-        self.batch.logits[n_tokens - 1] = True
-
-    def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool):
-        n_tokens = len(batch)
+        Args:
+            token: The integer ID of the token to add.
+            pos: The logical sequence position (n_past) of this token.
+            seq_ids: A sequence of sequence IDs this token belongs to (e.g., [0] for a
+                standard single chat). A single token can be part of multiple sequences
+                simultaneously.
+            logits: A boolean flag indicating whether the backend should compute logits
+                for this token.
+        """
+        idx = self.batch.n_tokens
+        if idx >= self.n_tokens_capacity:
+            raise IndexError(f"LlamaBatch overflow[add_token]: Cannot add token. Capacity {self.n_tokens_capacity} reached.")
+
+        self.batch.token[idx] = token
+        self.batch.pos[idx] = pos
+
+        n_seq_id = len(seq_ids)
+        if n_seq_id > self.n_seq_max:
+            raise ValueError(f"LlamaBatch Error[add_token]: Token belongs to {n_seq_id} sequences, "
+                             f"but n_seq_max was initialized to {self.n_seq_max}.")
+        self.batch.n_seq_id[idx] = n_seq_id
+
+        for i, seq_id in enumerate(seq_ids):
+            self.batch.seq_id[idx][i] = seq_id
+        self.batch.logits[idx] = logits
+
+        self.batch.n_tokens += 1
+
+    def add_sequence(
+        self,
+        token_array: Sequence[int],
+        pos_array: Sequence[int],
+        seq_ids: Sequence[int],
+        logits_array: Sequence[bool]
+    ):
+        """
+        Adds a sequence of tokens to the batch in a vectorized manner.
+        Maps the provided arrays directly onto the underlying llama_batch structure,
+        overriding nothing (unlike the removed set_batch, which forced the last
+        token's logits flag to True).
+
+        Args:
+            token_array: A sequence of token IDs to be evaluated.
+            pos_array: A sequence of logical positions corresponding to each token.
+            seq_ids: The sequence IDs that every token in this call belongs to
+                (e.g., [0] for tokens belonging only to sequence 0).
+            logits_array: A sequence of boolean flags indicating whether to compute
+                logits for each token.
+        """
+        n_tokens = len(token_array)
         current_count = self.batch.n_tokens
+
         if current_count + n_tokens > self.n_tokens_capacity:
             raise IndexError(
-                f"LlamaBatch overflow: Cannot add {n_tokens} tokens. "
+                f"LlamaBatch overflow[add_sequence]: Cannot add {n_tokens} tokens. "
                 f"Space left: {self.n_tokens_capacity - current_count}"
             )
-        self.batch.n_tokens += n_tokens
+
+        n_seq_id = len(seq_ids)
+        if n_seq_id > self.n_seq_max:
+            raise ValueError(f"LlamaBatch Error[add_sequence]: Token belongs to {n_seq_id} sequences, "
+                             f"but n_seq_max was initialized to {self.n_seq_max}.")
+
         for i in range(n_tokens):
             j = current_count + i
-            self.batch.token[j] = batch[i]
-            self.batch.pos[j] = i
-            self.batch.seq_id[j][0] = seq_id
-            self.batch.n_seq_id[j] = 1
-            self.batch.logits[j] = logits_all
-        self.batch.logits[current_count + n_tokens - 1] = True
+            self.batch.token[j] = token_array[i]
+            self.batch.pos[j] = pos_array[i]
+
+            self.batch.n_seq_id[j] = n_seq_id
+            for k, seq_id in enumerate(seq_ids):
+                self.batch.seq_id[j][k] = seq_id
+
+            self.batch.logits[j] = logits_array[i]
+
+        self.batch.n_tokens += n_tokens
 
 
     # Embedding functions
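
In the decode loop, add_token avoids per-step list construction. A minimal usage sketch (hedged: `context`, `prompt_tokens`, `max_new_tokens`, and the `sample()` helper are illustrative names, not part of this diff):

    n_past = len(prompt_tokens)
    token = last_sampled_token          # assumed: last token sampled after prompt eval
    for _ in range(max_new_tokens):
        batch.reset()
        # One token per step; request logits so the next token can be sampled.
        batch.add_token(token, pos=n_past, seq_ids=[0], logits=True)
        context.decode(batch)
        token = sample(context)         # hypothetical sampling helper
        n_past += 1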

llama_cpp/llama_embedding.py (13 additions & 1 deletion)
@@ -251,8 +251,20 @@ def _decode_batch():
             _decode_batch()
             idx_in_batch = 0
 
+            pos_array = list(range(n_tokens))
+
+            if is_none:
+                logits_array = [True] * n_tokens
+            else:
+                logits_array = [False] * (n_tokens - 1) + [True]
+
             # Add to Batch
-            self._batch.add_sequence(tokens, idx_in_batch, logits_all=logits_all)
+            self._batch.add_sequence(
+                token_array=tokens,
+                pos_array=pos_array,
+                seq_ids=[idx_in_batch],
+                logits_array=logits_array
+            )
             batch_seq_lens.append(n_tokens)
             idx_in_batch += 1
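
The per-token flags matter for pooling: with no pooling (the `is_none` branch above) every token's output is kept, otherwise only the last token's. A hedged sketch of batching two inputs under distinct sequence IDs (`tokens_a`/`tokens_b` are illustrative):

    for seq, tokens in enumerate([tokens_a, tokens_b]):
        batch.add_sequence(
            token_array=tokens,
            pos_array=list(range(len(tokens))),
            seq_ids=[seq],                       # each input gets its own sequence ID
            logits_array=[True] * len(tokens),   # no pooling: keep outputs for every token
        )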

tests/test_llama.py (1 addition & 1 deletion)
@@ -124,7 +124,7 @@ def test_real_model(llama_cpp_model_path):
 
     for _ in range(4):
         # Prepare batch with current tokens
-        batch.set_batch(curr_tokens, n_past=n_eval, logits_all=False)
+        batch.add_token(curr_tokens[0], pos=n_eval, seq_ids=[0], logits=True)
 
         # Decode (run inference)
         context.decode(batch)
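
Note that add_token takes a single token ID, and set_batch always computed the final token's logits, hence the indexing and logits=True above. If `curr_tokens` ever carries more than one token (e.g., an initial prompt), the granular API covers that with a loop; a hedged sketch mirroring the old set_batch semantics:

    for i, tok in enumerate(curr_tokens):
        # Only the final token needs logits, matching set_batch's forced last logit.
        batch.add_token(tok, pos=n_eval + i, seq_ids=[0], logits=(i == len(curr_tokens) - 1))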
