Merged
54 changes: 44 additions & 10 deletions lightllm/server/api_anthropic.py
@@ -42,7 +42,8 @@ def get_anthropic_messages_adapter() -> Any:
except ImportError as exc:
raise RuntimeError(
"The Anthropic Messages API (/v1/messages) requires the 'litellm' package. "
"Install it with: pip install 'litellm>=1.52.0,<1.85'. "
"Install it with: pip install 'lightllm[anthropic]' "
"(or directly: pip install 'litellm>=1.52.0,<1.85'). "
f"Original error: {exc}"
) from exc

@@ -76,10 +77,23 @@ def _anthropic_to_chat_request(anthropic_body: Dict[str, Any]) -> Tuple[Dict[str
if "max_tokens" in anthropic_body:
openai_dict["max_tokens"] = anthropic_body["max_tokens"]

# Forward LightLLM-specific fields nested under ``extra_body`` (OpenAI SDK
# convention) so clients hitting /v1/messages can reach ChatCompletionRequest
# options Anthropic's own schema does not expose — notably chat_template_kwargs
# for models with optional thinking modes (Qwen3, DeepSeek). Fields already
# produced by the Anthropic->OpenAI translation take precedence; unknown keys
# are silently dropped by Pydantic (extra='ignore').
extra_body = anthropic_body.get("extra_body")
if isinstance(extra_body, dict):
for k, v in extra_body.items():
openai_dict.setdefault(k, v)

_UNKNOWN_FIELDS = {"extra_body", "metadata", "anthropic_version", "cache_control"}
-    for key in list(openai_dict.keys()):
-        if key in _UNKNOWN_FIELDS:
-            openai_dict.pop(key, None)
+    dropped = [k for k in anthropic_body if k in _UNKNOWN_FIELDS]
+    if dropped:
+        logger.debug("Dropping Anthropic-only fields not forwarded to chat pipeline: %s", dropped)
+    for key in dropped:
+        openai_dict.pop(key, None)

return openai_dict, tool_name_mapping
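
As an illustration of this passthrough, a client could reach ChatCompletionRequest-only options through /v1/messages roughly like the sketch below. The server URL, model name, and the enable_thinking kwarg are assumed placeholders, not values taken from this diff.

import requests

# Hypothetical request against a local LightLLM server. Keys under extra_body
# are merged into the translated OpenAI-style dict via setdefault(), so they
# can add options but never override fields the Anthropic->OpenAI translation
# already produced; unknown keys are dropped by Pydantic (extra='ignore').
resp = requests.post(
    "http://localhost:8000/v1/messages",
    json={
        "model": "qwen3",
        "max_tokens": 256,
        "messages": [{"role": "user", "content": "hello"}],
        "extra_body": {
            # assumed example of a chat-template option with a thinking mode
            "chat_template_kwargs": {"enable_thinking": True},
        },
    },
)
print(resp.json())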

@@ -409,12 +423,32 @@ async def _openai_sse_to_anthropic_events(
)
state["buffered_args"] = ""
else:
-# Already started. If deltas for a different block are
-# now arriving (unusual interleaving), close whatever's
-# currently open and reopen... but in practice OpenAI
-# streams tool_calls sequentially per index, so the
-# current_open is this same block.
+# Already started. A delta for this tool-call index may
+# arrive after a later tool-call has opened its own block.
+# Anthropic's protocol forbids emitting deltas against a
+# non-open index, so close whatever is currently open and
+# reopen THIS block before emitting.
if new_args:
if current_open is None or current_open != ("tool_use", state["anthropic_index"]):
if current_open is not None:
yield _sse_event(
"content_block_stop",
{"type": "content_block_stop", "index": current_open[1]},
)
current_open = ("tool_use", state["anthropic_index"])
yield _sse_event(
"content_block_start",
{
"type": "content_block_start",
"index": state["anthropic_index"],
"content_block": {
"type": "tool_use",
"id": state["id"] or f"toolu_{uuid.uuid4().hex[:24]}",
Contributor comment (critical): The uuid module is used here to generate a unique tool-use ID, but it does not appear to be imported in this file. This will cause a NameError at runtime when this branch is hit.

"name": state["name"],
"input": {},
},
},
)
yield _sse_event(
"content_block_delta",
{
@@ -559,5 +593,5 @@ async def anthropic_messages_impl(raw_request: Request) -> Response:
anthropic_dict = _chat_response_to_anthropic(downstream, tool_name_mapping, requested_model)
except Exception as exc:
logger.error("Failed to translate response to Anthropic format: %s", exc)
-return JSONResponse(_anthropic_error_response(500, str(exc)), status_code=500)
+return _anthropic_error_response(HTTPStatus.INTERNAL_SERVER_ERROR, str(exc))
return JSONResponse(anthropic_dict)
69 changes: 63 additions & 6 deletions lightllm/server/api_http.py
@@ -116,10 +116,58 @@ def set_args(self, args: StartArgs):
app = FastAPI()
g_objs.app = app

_ACCESS_LOG_STATUS_COLORS = {2: "\033[32m", 3: "\033[36m", 4: "\033[33m", 5: "\033[31m"}
_ACCESS_LOG_RESET = "\033[0m"


class _AccessLogMiddleware:
def __init__(self, app):
self.app = app

async def __call__(self, scope, receive, send):
if scope["type"] not in ("http", "websocket"):
await self.app(scope, receive, send)
return

status_holder = {"status": 0}

async def send_wrapper(message):
if message["type"] == "http.response.start":
status_holder["status"] = message["status"]
await send(message)

try:
await self.app(scope, receive, send_wrapper)
finally:
if scope["type"] == "http":
status = status_holder["status"]
msg = f"{scope['method']} {scope['path']} {status}"
color = _ACCESS_LOG_STATUS_COLORS.get(status // 100, "")
if color:
msg = color + msg + _ACCESS_LOG_RESET
logger.info(msg)


app.add_middleware(_AccessLogMiddleware)
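
For reference, each completed HTTP request now produces one access-log line of the form "<METHOD> <PATH> <STATUS>", colorized by status class. The lines below are illustrative, not captured output:

# POST /v1/chat/completions 200   -> wrapped in \033[32m ... \033[0m (green, 2xx)
# GET /nonexistent 404            -> wrapped in \033[33m ... \033[0m (yellow, 4xx)
# POST /generate 503              -> wrapped in \033[31m ... \033[0m (red, 5xx)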


-def create_error_response(status_code: HTTPStatus, message: str) -> JSONResponse:
+def create_error_response(
+    status_code: HTTPStatus, message: str, err_type: str = None, param: str = None
+) -> JSONResponse:
+    if err_type is None:
+        if status_code.value >= 500:
+            err_type = "InternalServerError"
+        elif status_code == HTTPStatus.NOT_FOUND:
+            err_type = "NotFoundError"
+        else:
+            err_type = "BadRequestError"
     g_objs.metric_client.counter_inc("lightllm_request_failure")
-    return JSONResponse({"message": message}, status_code=status_code.value)
+    return JSONResponse(
+        {"error": {"message": message, "type": err_type, "param": param, "code": status_code.value}},
+        status_code=status_code.value,
+    )
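
With this change the error body follows an OpenAI-style envelope instead of a bare {"message": ...}. An illustrative payload for create_error_response(HTTPStatus.BAD_REQUEST, "prompt too long"):

# HTTP 400
# {
#     "error": {
#         "message": "prompt too long",
#         "type": "BadRequestError",
#         "param": null,
#         "code": 400
#     }
# }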


@app.get("/liveness")
@@ -194,6 +242,8 @@ async def generate(request: Request) -> Response:
except ServerBusyError as e:
logger.error("%s", str(e), exc_info=True)
return create_error_response(HTTPStatus.SERVICE_UNAVAILABLE, str(e))
except ValueError as e:
return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
except Exception as e:
logger.error("An error occurred: %s", str(e), exc_info=True)
return create_error_response(HTTPStatus.EXPECTATION_FAILED, str(e))
@@ -211,6 +261,8 @@ async def generate_stream(request: Request) -> Response:
except ServerBusyError as e:
logger.error("%s", str(e), exc_info=True)
return create_error_response(HTTPStatus.SERVICE_UNAVAILABLE, str(e))
except ValueError as e:
return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
except Exception as e:
logger.error("An error occurred: %s", str(e), exc_info=True)
return create_error_response(HTTPStatus.EXPECTATION_FAILED, str(e))
@@ -251,7 +303,10 @@ async def chat_completions(request: ChatCompletionRequest, raw_request: Request)
HTTPStatus.EXPECTATION_FAILED, "service in pd mode dont recv reqs from http interface"
)

-    resp = await chat_completions_impl(request, raw_request)
+    try:
+        resp = await chat_completions_impl(request, raw_request)
+    except ValueError as e:
+        return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
return resp


@@ -262,7 +317,10 @@ async def completions(request: CompletionRequest, raw_request: Request) -> Respo
HTTPStatus.EXPECTATION_FAILED, "service in pd mode dont recv reqs from http interface"
)

-    resp = await completions_impl(request, raw_request)
+    try:
+        resp = await completions_impl(request, raw_request)
+    except ValueError as e:
+        return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
return resp


@@ -278,7 +336,6 @@ async def anthropic_messages(raw_request: Request) -> Response:


@app.get("/v1/models", response_model=ModelListResponse)
-@app.post("/v1/models", response_model=ModelListResponse)
async def get_models(raw_request: Request):
model_name = g_objs.args.model_name
max_model_len = g_objs.args.max_req_total_len
@@ -291,7 +348,7 @@ async def get_models(raw_request: Request):
id=model_name,
created=g_objs.model_created,
max_model_len=max_model_len,
-owned_by=g_objs.args.model_owner,
+owned_by=g_objs.args.model_owner or "lightllm",
)
]
)
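
An illustrative GET /v1/models payload after this change, assuming ModelListResponse follows the usual OpenAI list envelope (the envelope and all values here are placeholders, not taken from this diff):

# {
#     "object": "list",
#     "data": [
#         {
#             "id": "my-model",          # g_objs.args.model_name
#             "created": 1700000000,     # g_objs.model_created
#             "max_model_len": 16384,    # g_objs.args.max_req_total_len
#             "owned_by": "lightllm"     # fallback when args.model_owner is unset
#         }
#     ]
# }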
6 changes: 5 additions & 1 deletion lightllm/server/api_lightllm.py
@@ -148,5 +148,9 @@ async def stream_results() -> AsyncGenerator[bytes, None]:

yield ("data:" + json.dumps(ret, ensure_ascii=False) + "\n\n").encode("utf-8")

from .api_openai import _safe_stream_wrapper

background_tasks = BackgroundTasks()
-    return StreamingResponse(stream_results(), media_type="text/event-stream", background=background_tasks)
+    return StreamingResponse(
+        _safe_stream_wrapper(stream_results()), media_type="text/event-stream", background=background_tasks
+    )
31 changes: 28 additions & 3 deletions lightllm/server/api_models.py
@@ -33,7 +33,6 @@ class Function(BaseModel):
name: Optional[str] = None
description: Optional[str] = Field(default=None, examples=[None])
parameters: Optional[dict] = None
-response: Optional[dict] = None


class Tool(BaseModel):
@@ -96,6 +95,7 @@ class ChatCompletionMessageGenericParam(BaseModel):
content: Union[str, List[MessageContent], None] = Field(default=None)
tool_call_id: Optional[str] = None
name: Optional[str] = None
reasoning: Optional[str] = None
reasoning_content: Optional[str] = None
tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])

@@ -121,7 +121,7 @@ class CompletionRequest(BaseModel):
prompt: Union[str, List[str], List[int], List[List[int]]]
suffix: Optional[str] = None
max_tokens: Optional[int] = Field(
-default=16384, deprecated="max_tokens is deprecated, please use max_completion_tokens instead"
+default=65536, deprecated="max_tokens is deprecated, please use max_completion_tokens instead"
)
max_completion_tokens: Optional[int] = None
temperature: Optional[float] = 1.0
@@ -197,7 +197,7 @@ class ChatCompletionRequest(BaseModel):
stream_options: Optional[StreamOptions] = None
stop: Optional[Union[str, List[str]]] = None
max_tokens: Optional[int] = Field(
-default=16384, deprecated="max_tokens is deprecated, please use max_completion_tokens instead"
+default=65536, deprecated="max_tokens is deprecated, please use max_completion_tokens instead"
)
max_completion_tokens: Optional[int] = None
presence_penalty: Optional[float] = 0.0
@@ -221,6 +221,7 @@ class ChatCompletionRequest(BaseModel):
parallel_tool_calls: Optional[bool] = True

# OpenAI parameters for reasoning and others
reasoning_effort: Optional[Literal["low", "medium", "high"]] = None
chat_template_kwargs: Optional[Dict] = None
separate_reasoning: Optional[bool] = True
stream_reasoning: Optional[bool] = False
@@ -278,18 +279,33 @@ def sync_thinking_chat_template_kwargs(self):
return self


class PromptTokensDetails(BaseModel):
cached_tokens: int = 0
audio_tokens: int = 0


class UsageInfo(BaseModel):
prompt_tokens: int = 0
completion_tokens: Optional[int] = 0
total_tokens: int = 0
prompt_tokens_details: Optional[PromptTokensDetails] = None


class ChatMessage(BaseModel):
role: Optional[str] = None
content: Optional[str] = None
reasoning: Optional[str] = None
Contributor comment (high): Renaming reasoning_content to reasoning in ChatMessage is a breaking change for existing API clients. To maintain backward compatibility while supporting the new field name, both fields should be present, or reasoning should be added as an alias.

Suggested change:
-reasoning: Optional[str] = None
+reasoning: Optional[str] = None
+reasoning_content: Optional[str] = None
reasoning_content: Optional[str] = None
tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])

@model_validator(mode="after")
def _sync_reasoning_aliases(self):
if self.reasoning and not self.reasoning_content:
self.reasoning_content = self.reasoning
elif self.reasoning_content and not self.reasoning:
self.reasoning = self.reasoning_content
return self


class ChatCompletionResponseChoice(BaseModel):
index: int
@@ -314,8 +330,17 @@ class DeltaMessage(BaseModel):
role: Optional[str] = None
content: Optional[str] = None
tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
reasoning: Optional[str] = None
Contributor comment (high): Renaming reasoning_content to reasoning in DeltaMessage is a breaking change for streaming clients. It is recommended to keep both fields to ensure compatibility with existing integrations.

Suggested change:
-reasoning: Optional[str] = None
+reasoning: Optional[str] = None
+reasoning_content: Optional[str] = None
reasoning_content: Optional[str] = None

@model_validator(mode="after")
def _sync_reasoning_aliases(self):
if self.reasoning and not self.reasoning_content:
self.reasoning_content = self.reasoning
elif self.reasoning_content and not self.reasoning:
self.reasoning = self.reasoning_content
return self
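
A minimal sketch of the alias behavior these validators add (assuming ChatMessage and DeltaMessage are imported from lightllm.server.api_models):

from lightllm.server.api_models import ChatMessage, DeltaMessage

# Whichever alias the caller populates, the model_validator mirrors it to the
# other, so clients reading reasoning_content and clients reading reasoning
# both see the same text.
msg = ChatMessage(role="assistant", content="hi", reasoning="let me think...")
assert msg.reasoning_content == "let me think..."

delta = DeltaMessage(reasoning_content="step 1")
assert delta.reasoning == "step 1"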


class ChatCompletionStreamResponseChoice(BaseModel):
index: int