OpenAI 的调用 API 几乎已经成为大模型社区事实上的调用标准——不论是阿里的灵积、智谱、Together、vLLM、Ollama、FastChat 等,都支持 OpenAI 的调用方式,所以兼容它也是大势所趋。
有时候我们想做个中间层,底层调用大模型,上层提供业务服务,特别是许多公司的多节点的agent,如果我们都保持一致那么统一性就很好了
代码如下,代码不是很精细,个别小细节需要改下,但是整体不影响使用和借鉴:
import asyncio
import json
import time
from typing import List, Optional

import uvicorn
from fastapi import FastAPI
from openai import OpenAI
from pydantic import BaseModel
from starlette.responses import StreamingResponse

app = FastAPI(title="OpenAI-compatible API")


class ChatMessage(BaseModel):
    """A single chat message in the OpenAI chat format."""
    role: str      # e.g. "user", "assistant", "system"
    content: str   # the message text


class ChatCompletionRequest(BaseModel):
    """Request body compatible with OpenAI's /v1/chat/completions."""
    model: str = "mock-gpt-model"
    messages: List[ChatMessage]
    max_tokens: Optional[int] = 512
    temperature: Optional[float] = 0.1
    stream: Optional[bool] = False


class CallOpenAI:
    """Client for the underlying LLM served behind an OpenAI-compatible endpoint."""

    def __init__(self):
        # Backend model server; api_key="EMPTY" is the vLLM/FastChat convention
        # for servers that do not check authentication.
        self.client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:9001/v1")

    def invoke(self, messages: str):
        """Send one user prompt to the backend model and stream the completion.

        Args:
            messages: the prompt text (a plain string; wrapped here as a
                single user-role message).

        Returns:
            The streaming completion iterator returned by the OpenAI client
            (``stream=True``), yielding chat.completion.chunk objects.
        """
        wrapped = [{'role': 'user', 'content': messages}]
        completion = self.client.chat.completions.create(
            model='Qwen2-72-gptq-int4',
            messages=wrapped,
            max_tokens=500,
            stream=True,
        )
        return completion
# Module-level backend client, shared by all requests.
model = CallOpenAI()


async def _resp_async_generator1(text_resp: str):
    """Proxy the backend's streaming completion as OpenAI-style SSE chunks.

    Args:
        text_resp: the prompt forwarded verbatim to the underlying model.

    Yields:
        ``data: {json}\\n\\n`` SSE lines in the chat.completion.chunk format,
        terminated by ``data: [DONE]\\n\\n``.
    """
    response = model.invoke(text_resp)
    for chunk in response:
        # Some chunks carry a None delta (role-only or final chunk); emit "".
        content = chunk.choices[0].delta.content or ''
        # Build the outgoing payload in a new variable instead of rebinding
        # the loop variable `chunk` (the original shadowed it).
        payload = {
            "id": 1,
            "object": "chat.completion.chunk",
            "created": time.time(),
            "model": "blah",
            "choices": [{"delta": {"content": content}}],
        }
        yield f"data: {json.dumps(payload)}\n\n"
        # Yield control to the event loop between chunks.
        await asyncio.sleep(0)
    yield "data: [DONE]\n\n"


@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """OpenAI-compatible chat endpoint.

    Forwards the last user message to the backend model when streaming is
    requested; otherwise echoes it back in a one-shot mock response.
    """
    if request.messages:
        # BUGFIX: the original overwrote resp_content unconditionally with a
        # hard-coded debug prompt ('请基于春天写一篇1000字的文章'), so the
        # incoming request was ignored; it also prepended a leftover mock
        # "echo" prefix that would have polluted the prompt sent to the model.
        resp_content = request.messages[-1].content
    else:
        resp_content = "As a mock AI Assitant, I can only echo your last message, but there wasn't one!"
    if request.stream:
        # Stream SSE lines; OpenAI clients parse the "data: ..." framing.
        return StreamingResponse(_resp_async_generator1(resp_content), media_type="application/x-ndjson")
    # Non-streaming path does not call the backend model (mock echo),
    # matching the original behavior of this branch.
    return {
        "id": "1337",
        "object": "chat.completion",
        "created": time.time(),
        "model": request.model,
        "choices": [{"message": ChatMessage(role="assistant", content=resp_content)}],
    }


if __name__ == '__main__':
    uvicorn.run(app, host="0.0.0.0", port=8000)
启动服务,运行效果如下:
写个代码测试下:
from openai import OpenAI

# Init client and connect to the local middle-layer server started above.
client = OpenAI(
    api_key="fake-api-key",
    base_url="http://localhost:8000/v1/",  # change the default port if needed
)

# Request a streaming chat completion from the middle layer.
stream = client.chat.completions.create(
    model="mock-gpt-model",
    messages=[{"role": "user", "content": "Say this is a test"}],
    stream=True,
)

# Print tokens as they arrive; delta.content may be None on some chunks.
for chunk in stream:
    print(chunk.choices[0].delta.content or "", end='', flush=True)
结果如下图所示,实现了流式输出。