add initial examples

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
Author: lvliang-intel
Date:   2024-03-21 10:17:09 +08:00
Parent: bc7c18f68d
Commit: fabff168ff

147 changed files with 23216 additions and 0 deletions


@@ -0,0 +1,25 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM langchain/langchain
RUN apt-get update && apt-get -y install libgl1-mesa-glx
RUN pip install -U langchain-cli pydantic==1.10.13
RUN pip install langchain==0.1.11
RUN pip install shortuuid
RUN pip install huggingface_hub
RUN mkdir -p /ws
ENV PYTHONPATH=/ws
COPY codegen-app /codegen-app
WORKDIR /codegen-app
CMD ["/bin/bash"]


@@ -0,0 +1,17 @@
#!/bin/bash
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
docker build . -t copilot:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
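
To try the image this script produces, one possible launch sequence is sketched below. It assumes the FastAPI app later in this commit is saved as server.py under codegen-app/ and that uvicorn is available in the base image; neither is fixed by the files shown here.

# Hedged example: run the copilot image and start the API on port 8000.
# <tgi-host> is a placeholder for a reachable Text Generation Inference server.
docker run -it --name copilot \
    -p 8000:8000 \
    -e TGI_ENDPOINT=http://<tgi-host>:8080 \
    -e http_proxy=$http_proxy -e https_proxy=$https_proxy \
    copilot:latest \
    bash -c "uvicorn server:app --host 0.0.0.0 --port 8000"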


@@ -0,0 +1,42 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Code source from FastChat's OpenAI protocol:
https://github.com/lm-sys/FastChat/blob/main/fastchat/protocol/openai_api_protocol.py
"""
from typing import Optional, List, Any, Union
import time
import shortuuid
# pylint: disable=E0611
from pydantic import BaseModel, Field


class ChatCompletionRequest(BaseModel):
    prompt: Union[str, List[Any]]
    device: Optional[str] = 'cpu'
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 1.0
    top_k: Optional[int] = 1
    repetition_penalty: Optional[float] = 1.0
    max_new_tokens: Optional[int] = 128
    stream: Optional[bool] = False


class ChatCompletionResponse(BaseModel):
    id: str = Field(default_factory=lambda: f"chatcmpl-{shortuuid.random()}")
    object: str = "chat.completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    response: str
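
As a reference for the wire format these models define, a request whose JSON body mirrors ChatCompletionRequest, and the shape of the serialized ChatCompletionResponse, might look as follows. The host and port assume the uvicorn defaults in the server file later in this commit; the prompt is purely illustrative.

# All fields except "prompt" are optional and are shown with their defaults.
curl -X POST http://localhost:8000/v1/code_generation \
    -H "Content-Type: application/json" \
    -d '{"prompt": "Write a function that reverses a string.",
         "device": "cpu",
         "temperature": 0.7,
         "top_p": 1.0,
         "top_k": 1,
         "repetition_penalty": 1.0,
         "max_new_tokens": 128,
         "stream": false}'

# The non-streaming reply is a serialized ChatCompletionResponse, e.g.:
# {"id": "chatcmpl-<shortuuid>", "object": "chat.completion", "created": 1711000000, "response": "..."}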


@@ -0,0 +1,229 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import requests
import json
import types
from concurrent import futures
from typing import Optional
from fastapi import FastAPI, APIRouter
from fastapi.responses import RedirectResponse, StreamingResponse
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import HuggingFaceEndpoint
from langchain_core.pydantic_v1 import BaseModel
from starlette.middleware.cors import CORSMiddleware
from openai_protocol import ChatCompletionRequest, ChatCompletionResponse

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class CodeGenAPIRouter(APIRouter):
    def __init__(self, entrypoint) -> None:
        super().__init__()
        self.entrypoint = entrypoint
        print(f"[codegen - router] Initializing API Router, entrypoint={entrypoint}")

        # Define LLM
        self.llm = HuggingFaceEndpoint(
            endpoint_url=entrypoint,
            max_new_tokens=512,
            top_k=10,
            top_p=0.95,
            typical_p=0.95,
            temperature=0.01,
            repetition_penalty=1.03,
            streaming=True,
        )
        print("[codegen - router] LLM initialized.")

    def is_generator(self, obj):
        return isinstance(obj, types.GeneratorType)

    def handle_chat_completion_request(self, request: ChatCompletionRequest):
        try:
            print(f"Predicting chat completion using prompt '{request.prompt}'")
            buffered_texts = ""
            if request.stream:
                generator = self.llm(request.prompt, callbacks=[StreamingStdOutCallbackHandler()])
                if not self.is_generator(generator):
                    generator = (generator,)

                def stream_generator():
                    nonlocal buffered_texts
                    for output in generator:
                        yield f"data: {output}\n\n"
                    yield "data: [DONE]\n\n"

                return StreamingResponse(stream_generator(), media_type="text/event-stream")
            else:
                response = self.llm(request.prompt)
        except Exception as e:
            print(f"An error occurred: {e}")
        else:
            print("Chat completion finished.")
            return ChatCompletionResponse(response=response)

tgi_endpoint = os.getenv("TGI_ENDPOINT", "http://localhost:8080")
router = CodeGenAPIRouter(tgi_endpoint)
app.include_router(router)


def check_completion_request(request: BaseModel) -> Optional[str]:
    if request.temperature is not None and request.temperature < 0:
        return f"Param Error: {request.temperature} is less than the minimum of 0 --- 'temperature'"
    if request.temperature is not None and request.temperature > 2:
        return f"Param Error: {request.temperature} is greater than the maximum of 2 --- 'temperature'"
    if request.top_p is not None and request.top_p < 0:
        return f"Param Error: {request.top_p} is less than the minimum of 0 --- 'top_p'"
    if request.top_p is not None and request.top_p > 1:
        return f"Param Error: {request.top_p} is greater than the maximum of 1 --- 'top_p'"
    if request.top_k is not None and (not isinstance(request.top_k, int)):
        return f"Param Error: {request.top_k} is not valid under any of the given schemas --- 'top_k'"
    if request.top_k is not None and request.top_k < 1:
        return f"Param Error: {request.top_k} is less than the minimum of 1 --- 'top_k'"
    if request.max_new_tokens is not None and (not isinstance(request.max_new_tokens, int)):
        return f"Param Error: {request.max_new_tokens} is not valid under any of the given schemas --- 'max_new_tokens'"
    return None


def filter_code_format(code):
    """Extract the body of the first fenced code block in the generated text, if any."""
    language_prefixes = {
        "go": "```go",
        "c": "```c",
        "cpp": "```cpp",
        "java": "```java",
        "python": "```python",
        "typescript": "```typescript",
    }
    suffix = "\n```"

    # Find the first occurrence of a language prefix
    prefix_found = False
    first_prefix_pos = len(code)
    for prefix in language_prefixes.values():
        pos = code.find(prefix)
        if pos != -1 and pos < first_prefix_pos:
            prefix_found = True
            first_prefix_pos = pos + len(prefix) + 1

    # No language fence found: return the text unchanged
    if not prefix_found:
        return code

    # Find the first occurrence of the suffix after the first language prefix
    first_suffix_pos = code.find(suffix, first_prefix_pos + 1)

    # Extract the code block
    if first_suffix_pos != -1:
        return code[first_prefix_pos:first_suffix_pos]
    return code[first_prefix_pos:]


# router /v1/code_generation only supports non-streaming mode.
@router.post("/v1/code_generation")
async def code_generation_endpoint(chat_request: ChatCompletionRequest):
    # use_deepspeed/host/port/world_size are only present when the router fans out to
    # multiple DeepSpeed workers; CodeGenAPIRouter above does not define them, so
    # default to the single-endpoint path.
    if getattr(router, "use_deepspeed", False):
        responses = []

        def send_request(port):
            try:
                url = f'http://{router.host}:{port}/v1/code_generation'
                response = requests.post(url, json=chat_request.dict())
                response.raise_for_status()
                json_response = json.loads(response.content)
                cleaned_code = filter_code_format(json_response['response'])
                chat_completion_response = ChatCompletionResponse(response=cleaned_code)
                responses.append(chat_completion_response)
            except requests.exceptions.RequestException as e:
                print(f"Error sending/receiving on port {port}: {e}")

        with futures.ThreadPoolExecutor(max_workers=router.world_size) as executor:
            worker_ports = [router.port + i + 1 for i in range(router.world_size)]
            executor.map(send_request, worker_ports)

        if responses:
            return responses[0]
    else:
        ret = check_completion_request(chat_request)
        if ret is not None:
            raise RuntimeError(f"Invalid parameter: {ret}")
        return router.handle_chat_completion_request(chat_request)


# router /v1/code_chat supports both non-streaming and streaming modes.
@router.post("/v1/code_chat")
async def code_chat_endpoint(chat_request: ChatCompletionRequest):
    # As above, fall back to the single-endpoint path when the DeepSpeed fan-out
    # attributes are absent.
    if getattr(router, "use_deepspeed", False):
        if chat_request.stream:
            responses = []

            def generate_stream(port):
                url = f'http://{router.host}:{port}/v1/code_generation'
                response = requests.post(url, json=chat_request.dict(), stream=True, timeout=1000)
                responses.append(response)

            with futures.ThreadPoolExecutor(max_workers=router.world_size) as executor:
                worker_ports = [router.port + i + 1 for i in range(router.world_size)]
                executor.map(generate_stream, worker_ports)
            # Leaving the executor context waits for the worker requests, so responses
            # is already populated here (or empty if every request failed).

            def generate():
                if responses and responses[0]:
                    for chunk in responses[0].iter_lines(decode_unicode=False, delimiter=b"\0"):
                        if chunk:
                            yield f"data: {chunk}\n\n"
                    yield "data: [DONE]\n\n"

            return StreamingResponse(generate(), media_type="text/event-stream")
        else:
            responses = []

            def send_request(port):
                try:
                    url = f'http://{router.host}:{port}/v1/code_generation'
                    response = requests.post(url, json=chat_request.dict())
                    response.raise_for_status()
                    json_response = json.loads(response.content)
                    chat_completion_response = ChatCompletionResponse(response=json_response['response'])
                    responses.append(chat_completion_response)
                except requests.exceptions.RequestException as e:
                    print(f"Error sending/receiving on port {port}: {e}")

            with futures.ThreadPoolExecutor(max_workers=router.world_size) as executor:
                worker_ports = [router.port + i + 1 for i in range(router.world_size)]
                executor.map(send_request, worker_ports)

            if responses:
                return responses[0]
    else:
        ret = check_completion_request(chat_request)
        if ret is not None:
            raise RuntimeError(f"Invalid parameter: {ret}")
        return router.handle_chat_completion_request(chat_request)
@app.get("/")
async def redirect_root_to_docs():
return RedirectResponse("/docs")
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
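
To exercise the streaming path end to end, a minimal sketch follows; it assumes the service is reachable on localhost:8000 and that TGI_ENDPOINT points at a live model server.

# "stream": true selects the SSE branch of /v1/code_chat; curl's -N flag disables
# output buffering so each "data: ..." chunk and the closing "data: [DONE]" marker
# are printed as they arrive.
curl -N -X POST http://localhost:8000/v1/code_chat \
    -H "Content-Type: application/json" \
    -d '{"prompt": "Write a Python generator that yields Fibonacci numbers.", "stream": true}'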