add initial examples

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
Author: lvliang-intel
Date:   2024-03-21 10:17:09 +08:00
Parent: bc7c18f68d
Commit: fabff168ff

147 changed files with 23216 additions and 0 deletions


@@ -0,0 +1,25 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM langchain/langchain
RUN apt-get update && apt-get -y install libgl1-mesa-glx
RUN pip install -U langchain-cli pydantic==1.10.13
RUN pip install langchain==0.1.11
RUN pip install shortuuid
RUN pip install huggingface_hub
RUN mkdir -p /ws
ENV PYTHONPATH=/ws
COPY codegen-app /codegen-app
WORKDIR /codegen-app
CMD ["/bin/bash"]


@@ -0,0 +1,17 @@
#!/bin/bash
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
docker build . -t copilot:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
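
To try the image this script produces, one possible launch sequence is sketched below. It assumes the FastAPI app later in this commit is saved as server.py under codegen-app/ and that uvicorn is available in the base image; neither is fixed by the files shown here.

# Hedged example: run the copilot image and start the API on port 8000.
# <tgi-host> is a placeholder for a reachable Text Generation Inference server.
docker run -it --name copilot \
    -p 8000:8000 \
    -e TGI_ENDPOINT=http://<tgi-host>:8080 \
    -e http_proxy=$http_proxy -e https_proxy=$https_proxy \
    copilot:latest \
    bash -c "uvicorn server:app --host 0.0.0.0 --port 8000"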


@@ -0,0 +1,42 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Code source from FastChat's OpenAI protocol:
https://github.com/lm-sys/FastChat/blob/main/fastchat/protocol/openai_api_protocol.py
"""
from typing import Optional, List, Any, Union
import time
import shortuuid
# pylint: disable=E0611
from pydantic import BaseModel, Field


class ChatCompletionRequest(BaseModel):
    prompt: Union[str, List[Any]]
    device: Optional[str] = 'cpu'
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 1.0
    top_k: Optional[int] = 1
    repetition_penalty: Optional[float] = 1.0
    max_new_tokens: Optional[int] = 128
    stream: Optional[bool] = False


class ChatCompletionResponse(BaseModel):
    id: str = Field(default_factory=lambda: f"chatcmpl-{shortuuid.random()}")
    object: str = "chat.completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    response: str
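
As a reference for the wire format these models define, a request whose JSON body mirrors ChatCompletionRequest, and the shape of the serialized ChatCompletionResponse, might look as follows. The host and port assume the uvicorn defaults in the server file later in this commit; the prompt is purely illustrative.

# All fields except "prompt" are optional and are shown with their defaults.
curl -X POST http://localhost:8000/v1/code_generation \
    -H "Content-Type: application/json" \
    -d '{"prompt": "Write a function that reverses a string.",
         "device": "cpu",
         "temperature": 0.7,
         "top_p": 1.0,
         "top_k": 1,
         "repetition_penalty": 1.0,
         "max_new_tokens": 128,
         "stream": false}'

# The non-streaming reply is a serialized ChatCompletionResponse, e.g.:
# {"id": "chatcmpl-<shortuuid>", "object": "chat.completion", "created": 1711000000, "response": "..."}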


@@ -0,0 +1,229 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import requests
import json
import types
from concurrent import futures
from typing import Optional
from fastapi import FastAPI, APIRouter
from fastapi.responses import RedirectResponse, StreamingResponse
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import HuggingFaceEndpoint
from langchain_core.pydantic_v1 import BaseModel
from starlette.middleware.cors import CORSMiddleware
from openai_protocol import ChatCompletionRequest, ChatCompletionResponse

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class CodeGenAPIRouter(APIRouter):
    def __init__(self, entrypoint) -> None:
        super().__init__()
        self.entrypoint = entrypoint
        print(f"[codegen - router] Initializing API Router, entrypoint={entrypoint}")

        # Define LLM
        self.llm = HuggingFaceEndpoint(
            endpoint_url=entrypoint,
            max_new_tokens=512,
            top_k=10,
            top_p=0.95,
            typical_p=0.95,
            temperature=0.01,
            repetition_penalty=1.03,
            streaming=True,
        )
        print("[codegen - router] LLM initialized.")

    def is_generator(self, obj):
        return isinstance(obj, types.GeneratorType)

    def handle_chat_completion_request(self, request: ChatCompletionRequest):
        try:
            print(f"Predicting chat completion using prompt '{request.prompt}'")
            buffered_texts = ""
            if request.stream:
                generator = self.llm(request.prompt, callbacks=[StreamingStdOutCallbackHandler()])
                if not self.is_generator(generator):
                    generator = (generator,)

                def stream_generator():
                    nonlocal buffered_texts
                    for output in generator:
                        yield f"data: {output}\n\n"
                    yield "data: [DONE]\n\n"

                return StreamingResponse(stream_generator(), media_type="text/event-stream")
            else:
                response = self.llm(request.prompt)
        except Exception as e:
            print(f"An error occurred: {e}")
        else:
            print("Chat completion finished.")
            return ChatCompletionResponse(response=response)

tgi_endpoint = os.getenv("TGI_ENDPOINT", "http://localhost:8080")
router = CodeGenAPIRouter(tgi_endpoint)
app.include_router(router)


def check_completion_request(request: BaseModel) -> Optional[str]:
    if request.temperature is not None and request.temperature < 0:
        return f"Param Error: {request.temperature} is less than the minimum of 0 --- 'temperature'"
    if request.temperature is not None and request.temperature > 2:
        return f"Param Error: {request.temperature} is greater than the maximum of 2 --- 'temperature'"
    if request.top_p is not None and request.top_p < 0:
        return f"Param Error: {request.top_p} is less than the minimum of 0 --- 'top_p'"
    if request.top_p is not None and request.top_p > 1:
        return f"Param Error: {request.top_p} is greater than the maximum of 1 --- 'top_p'"
    if request.top_k is not None and (not isinstance(request.top_k, int)):
        return f"Param Error: {request.top_k} is not valid under any of the given schemas --- 'top_k'"
    if request.top_k is not None and request.top_k < 1:
        return f"Param Error: {request.top_k} is less than the minimum of 1 --- 'top_k'"
    if request.max_new_tokens is not None and (not isinstance(request.max_new_tokens, int)):
        return f"Param Error: {request.max_new_tokens} is not valid under any of the given schemas --- 'max_new_tokens'"
    return None


def filter_code_format(code):
    """Extract the body of the first fenced code block in the generated text, if any."""
    language_prefixes = {
        "go": "```go",
        "c": "```c",
        "cpp": "```cpp",
        "java": "```java",
        "python": "```python",
        "typescript": "```typescript",
    }
    suffix = "\n```"

    # Find the first occurrence of a language prefix
    prefix_found = False
    first_prefix_pos = len(code)
    for prefix in language_prefixes.values():
        pos = code.find(prefix)
        if pos != -1 and pos < first_prefix_pos:
            prefix_found = True
            first_prefix_pos = pos + len(prefix) + 1

    # No language fence found: return the text unchanged
    if not prefix_found:
        return code

    # Find the first occurrence of the suffix after the first language prefix
    first_suffix_pos = code.find(suffix, first_prefix_pos + 1)

    # Extract the code block
    if first_suffix_pos != -1:
        return code[first_prefix_pos:first_suffix_pos]
    return code[first_prefix_pos:]


# router /v1/code_generation only supports non-streaming mode.
@router.post("/v1/code_generation")
async def code_generation_endpoint(chat_request: ChatCompletionRequest):
    # use_deepspeed/host/port/world_size are only present when the router fans out to
    # multiple DeepSpeed workers; CodeGenAPIRouter above does not define them, so
    # default to the single-endpoint path.
    if getattr(router, "use_deepspeed", False):
        responses = []

        def send_request(port):
            try:
                url = f'http://{router.host}:{port}/v1/code_generation'
                response = requests.post(url, json=chat_request.dict())
                response.raise_for_status()
                json_response = json.loads(response.content)
                cleaned_code = filter_code_format(json_response['response'])
                chat_completion_response = ChatCompletionResponse(response=cleaned_code)
                responses.append(chat_completion_response)
            except requests.exceptions.RequestException as e:
                print(f"Error sending/receiving on port {port}: {e}")

        with futures.ThreadPoolExecutor(max_workers=router.world_size) as executor:
            worker_ports = [router.port + i + 1 for i in range(router.world_size)]
            executor.map(send_request, worker_ports)

        if responses:
            return responses[0]
    else:
        ret = check_completion_request(chat_request)
        if ret is not None:
            raise RuntimeError(f"Invalid parameter: {ret}")
        return router.handle_chat_completion_request(chat_request)


# router /v1/code_chat supports both non-streaming and streaming modes.
@router.post("/v1/code_chat")
async def code_chat_endpoint(chat_request: ChatCompletionRequest):
    # As above, fall back to the single-endpoint path when the DeepSpeed fan-out
    # attributes are absent.
    if getattr(router, "use_deepspeed", False):
        if chat_request.stream:
            responses = []

            def generate_stream(port):
                url = f'http://{router.host}:{port}/v1/code_generation'
                response = requests.post(url, json=chat_request.dict(), stream=True, timeout=1000)
                responses.append(response)

            with futures.ThreadPoolExecutor(max_workers=router.world_size) as executor:
                worker_ports = [router.port + i + 1 for i in range(router.world_size)]
                executor.map(generate_stream, worker_ports)
            # Leaving the executor context waits for the worker requests, so responses
            # is already populated here (or empty if every request failed).

            def generate():
                if responses and responses[0]:
                    for chunk in responses[0].iter_lines(decode_unicode=False, delimiter=b"\0"):
                        if chunk:
                            yield f"data: {chunk}\n\n"
                    yield "data: [DONE]\n\n"

            return StreamingResponse(generate(), media_type="text/event-stream")
        else:
            responses = []

            def send_request(port):
                try:
                    url = f'http://{router.host}:{port}/v1/code_generation'
                    response = requests.post(url, json=chat_request.dict())
                    response.raise_for_status()
                    json_response = json.loads(response.content)
                    chat_completion_response = ChatCompletionResponse(response=json_response['response'])
                    responses.append(chat_completion_response)
                except requests.exceptions.RequestException as e:
                    print(f"Error sending/receiving on port {port}: {e}")

            with futures.ThreadPoolExecutor(max_workers=router.world_size) as executor:
                worker_ports = [router.port + i + 1 for i in range(router.world_size)]
                executor.map(send_request, worker_ports)

            if responses:
                return responses[0]
    else:
        ret = check_completion_request(chat_request)
        if ret is not None:
            raise RuntimeError(f"Invalid parameter: {ret}")
        return router.handle_chat_completion_request(chat_request)
@app.get("/")
async def redirect_root_to_docs():
return RedirectResponse("/docs")
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
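
To exercise the streaming path end to end, a minimal sketch follows; it assumes the service is reachable on localhost:8000 and that TGI_ENDPOINT points at a live model server.

# "stream": true selects the SSE branch of /v1/code_chat; curl's -N flag disables
# output buffering so each "data: ..." chunk and the closing "data: [DONE]" marker
# are printed as they arrive.
curl -N -X POST http://localhost:8000/v1/code_chat \
    -H "Content-Type: application/json" \
    -d '{"prompt": "Write a Python generator that yields Fibonacci numbers.", "stream": true}'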