feat: double default and max generation budget

Raise the default inference_default_max_tokens from 512 to 1024 in both
chat-api and modal/serve.py. Raise the hard cap in modal from 2048 to 4096.
Fixes mid-sentence cutoffs on longer (especially thinking-mode) answers.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Manmohan Sharma 2026-04-22 22:20:05 -07:00
parent 066791fc03
commit 5bd773ef13
No known key found for this signature in database
2 changed files with 2 additions and 2 deletions

View File

@@ -229,7 +229,7 @@ class Inference:
messages = request.get("messages", [])
temperature = min(max(request.get("temperature", 0.8), 0.0), 2.0)
-        max_tokens = min(max(request.get("max_tokens", 512), 1), 2048)
+        max_tokens = min(max(request.get("max_tokens", 1024), 1), 4096)
top_k = min(max(request.get("top_k", 50), 0), 200)
force_web_search = bool(request.get("force_web_search", False))

View File

@@ -23,7 +23,7 @@ class Settings(BaseSettings):
auth_cache_max_size: int = Field(default=1024)
inference_default_temperature: float = Field(default=0.8)
-    inference_default_max_tokens: int = Field(default=512)
+    inference_default_max_tokens: int = Field(default=1024)
inference_default_top_k: int = Field(default=50)
frontend_url: str = Field(default="http://localhost:3000")