Refactor CodeGen example with microservice (#152)

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
lvliang-intel
2024-05-20 21:52:24 +08:00
committed by GitHub
parent d53eb69ac9
commit 6792bc10ca
21 changed files with 1160 additions and 146 deletions

View File

@@ -0,0 +1,49 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# SCRIPT USAGE NOTICE: By downloading and using any script file included
# with the associated software package (such as files with .bat, .cmd, or
# .JS extensions, Docker files, or any other type of file that, when executed,
# automatically downloads and/or installs files onto your system) (the “Script File”),
# it is your obligation to review the Script File to understand what files (e.g.,
# other software, AI models, AI Datasets) the Script File will download to your system
# (“Downloaded Files”). Furthermore, by downloading and using the Downloaded Files,
# even if they are installed through a silent install, you agree to any and all
# terms and conditions associated with such files, including but not limited to,
# license terms, notices, or disclaimers.
# Base image with LangChain preinstalled
FROM langchain/langchain:latest

# Install system libraries needed at runtime
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    libgl1-mesa-glx \
    libjemalloc-dev

# Create a non-root user and give it ownership of its home directory
RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

# Install Python dependencies as the non-root user
COPY requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -U -r /tmp/requirements.txt

# Make the application importable and set the working directory
ENV PYTHONPATH=/home/user:/home/user/codegen-app
WORKDIR /home/user/codegen-app
COPY codegen-app /home/user/codegen-app

SHELL ["/bin/bash", "-c"]

View File

@@ -0,0 +1,18 @@
#!/bin/bash
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
docker build . -t intel/gen-ai-examples:copilot --build-arg https_proxy="$https_proxy" --build-arg http_proxy="$http_proxy"
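
A minimal run sketch for the resulting image might look like the following, assuming a TGI server is already serving a code model and that the container is started by running the FastAPI entry script directly, since the Dockerfile sets no ENTRYPOINT; the entry file name codegen.py and the TGI host are assumptions for illustration:

# Hypothetical usage sketch: run the image against an existing TGI endpoint.
# TGI_ENDPOINT falls back to http://localhost:8080 in the server code;
# the entry file name codegen.py is assumed here.
docker run -it --rm \
    -p 8000:8000 \
    -e TGI_ENDPOINT="http://your-tgi-host:8080" \
    intel/gen-ai-examples:copilot \
    python codegen.py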

View File

@@ -0,0 +1,45 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Code source from FastChat's OpenAI protocol:
https://github.com/lm-sys/FastChat/blob/main/fastchat/protocol/openai_api_protocol.py
"""
import time
from typing import Any, List, Optional, Union
import shortuuid
# pylint: disable=E0611
from pydantic import BaseModel, Field
class ChatCompletionRequest(BaseModel):
prompt: Union[str, List[Any]]
device: Optional[str] = "cpu"
temperature: Optional[float] = 0.7
top_p: Optional[float] = 1.0
top_k: Optional[int] = 1
repetition_penalty: Optional[float] = 1.0
max_new_tokens: Optional[int] = 128
stream: Optional[bool] = False
class ChatCompletionResponse(BaseModel):
id: str = Field(default_factory=lambda: f"chatcmpl-{shortuuid.random()}")
object: str = "chat.completion"
created: int = Field(default_factory=lambda: int(time.time()))
response: str
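
For illustration, a request matching ChatCompletionRequest could look like the sketch below; every field except prompt is optional and is shown at its default. The host, port, and route are assumptions matching the FastAPI server defined later in this commit:

# Hypothetical request against a locally running instance.
curl -X POST http://localhost:8000/v1/code_generation \
    -H "Content-Type: application/json" \
    -d '{
          "prompt": "Write a Python function that checks whether a string is a palindrome.",
          "temperature": 0.7,
          "top_p": 1.0,
          "top_k": 1,
          "repetition_penalty": 1.0,
          "max_new_tokens": 128,
          "stream": false
        }'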

View File

@@ -0,0 +1,166 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import Optional

from fastapi import APIRouter, FastAPI
from fastapi.responses import RedirectResponse, StreamingResponse
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import HuggingFaceEndpoint
from langchain_core.pydantic_v1 import BaseModel
from openai_protocol import ChatCompletionRequest, ChatCompletionResponse
from starlette.middleware.cors import CORSMiddleware

app = FastAPI()
app.add_middleware(
    CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
)
def filter_code_format(code):
    """Extract the first fenced code block from the raw model output, if any."""
    language_prefixes = {
        "go": "```go",
        "c": "```c",
        "cpp": "```cpp",
        "java": "```java",
        "python": "```python",
        "typescript": "```typescript",
    }
    suffix = "\n```"
    # Locate the earliest opening fence; on a tie (```c also matches at the
    # position of ```cpp), prefer the longer prefix so the full language tag
    # is skipped.
    first_fence_pos = -1
    body_start = -1
    for prefix in language_prefixes.values():
        pos = code.find(prefix)
        if pos == -1:
            continue
        if first_fence_pos == -1 or pos < first_fence_pos or (pos == first_fence_pos and pos + len(prefix) + 1 > body_start):
            first_fence_pos = pos
            body_start = pos + len(prefix) + 1  # skip the fence line and its newline
    if first_fence_pos == -1:
        # No fenced block found; return the raw output unchanged
        return code
    # Find the closing fence after the opening one
    first_suffix_pos = code.find(suffix, body_start)
    if first_suffix_pos != -1:
        return code[body_start:first_suffix_pos]
    return code[body_start:]
class CodeGenAPIRouter(APIRouter):
def __init__(self, entrypoint) -> None:
super().__init__()
self.entrypoint = entrypoint
print(f"[codegen - router] Initializing API Router, entrypoint={entrypoint}")
# Define LLM
callbacks = [StreamingStdOutCallbackHandler()]
self.llm = HuggingFaceEndpoint(
endpoint_url=entrypoint,
max_new_tokens=1024,
top_k=10,
top_p=0.95,
typical_p=0.95,
temperature=0.01,
repetition_penalty=1.03,
streaming=True,
callbacks=callbacks,
)
print("[codegen - router] LLM initialized.")

    def handle_chat_completion_request(self, request: ChatCompletionRequest):
        try:
            print(f"Predicting chat completion using prompt '{request.prompt}'")
            if request.stream:

                async def stream_generator():
                    for chunk in self.llm.stream(request.prompt):
                        # chunk is already a str; wrap it in an SSE data frame
                        yield f"data: {chunk}\n\n"
                    yield "data: [DONE]\n\n"

                return StreamingResponse(stream_generator(), media_type="text/event-stream")
            result = self.llm(request.prompt)
            response = filter_code_format(result)
        except Exception as e:
            print(f"An error occurred: {e}")
            raise
        print("Chat completion finished.")
        return ChatCompletionResponse(response=response)

tgi_endpoint = os.getenv("TGI_ENDPOINT", "http://localhost:8080")
router = CodeGenAPIRouter(tgi_endpoint)
def check_completion_request(request: BaseModel) -> Optional[str]:
if request.temperature is not None and request.temperature < 0:
return f"Param Error: {request.temperature} is less than the minimum of 0 --- 'temperature'"
if request.temperature is not None and request.temperature > 2:
return f"Param Error: {request.temperature} is greater than the maximum of 2 --- 'temperature'"
if request.top_p is not None and request.top_p < 0:
return f"Param Error: {request.top_p} is less than the minimum of 0 --- 'top_p'"
if request.top_p is not None and request.top_p > 1:
return f"Param Error: {request.top_p} is greater than the maximum of 1 --- 'top_p'"
if request.top_k is not None and (not isinstance(request.top_k, int)):
return f"Param Error: {request.top_k} is not valid under any of the given schemas --- 'top_k'"
if request.top_k is not None and request.top_k < 1:
return f"Param Error: {request.top_k} is greater than the minimum of 1 --- 'top_k'"
if request.max_new_tokens is not None and (not isinstance(request.max_new_tokens, int)):
return f"Param Error: {request.max_new_tokens} is not valid under any of the given schemas --- 'max_new_tokens'"
return None
# router /v1/code_generation only supports non-streaming mode.
@router.post("/v1/code_generation")
async def code_generation_endpoint(chat_request: ChatCompletionRequest):
    ret = check_completion_request(chat_request)
    if ret is not None:
        raise RuntimeError(ret)
    return router.handle_chat_completion_request(chat_request)


# router /v1/code_chat supports both non-streaming and streaming modes.
@router.post("/v1/code_chat")
async def code_chat_endpoint(chat_request: ChatCompletionRequest):
    ret = check_completion_request(chat_request)
    if ret is not None:
        raise RuntimeError(ret)
    return router.handle_chat_completion_request(chat_request)

app.include_router(router)
@app.get("/")
async def redirect_root_to_docs():
return RedirectResponse("/docs")
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
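
As a quick smoke test, the streaming path can be exercised with curl; this sketch assumes the server is running locally on port 8000 as configured above, and -N keeps curl from buffering the SSE stream:

# Hypothetical streaming request; tokens arrive as "data: ..." SSE frames,
# terminated by a final "data: [DONE]".
curl -N -X POST http://localhost:8000/v1/code_chat \
    -H "Content-Type: application/json" \
    -d '{"prompt": "Implement quicksort in Go.", "stream": true}'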

View File

@@ -0,0 +1,5 @@
huggingface_hub
langchain==0.1.11
langchain-cli
pydantic==1.10.13
shortuuid