textgen ollama code refactor. (#1158)

Remove the Ollama folder, since the default OpenAI API is able to consume the Ollama service; modify the Ollama README and add a UT.
#998
Signed-off-by: Ye, Xinyu <xinyu.ye@intel.com>
XinyuYe-Intel
2025-01-17 16:49:31 +08:00
committed by GitHub
parent ff094b555c
commit 28f9c31129
11 changed files with 83 additions and 118 deletions
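The refactor relies on Ollama exposing an OpenAI-compatible endpoint, so the unified `llm-textgen` component (which already speaks the OpenAI API) can consume it directly instead of needing a dedicated LangChain-based wrapper. A minimal sketch of that idea, assuming an Ollama server on `localhost:11434` with `llama3` already pulled (`ollama pull llama3`); the API key is a placeholder that Ollama ignores:

```python
# Minimal sketch: consume a local Ollama server through its OpenAI-compatible API.
# Assumes Ollama is running on localhost:11434 and `ollama pull llama3` was done.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:11434/v1",  # Ollama's OpenAI-compatible endpoint
    api_key="unused",  # required by the client, ignored by Ollama
)

resp = client.chat.completions.create(
    model="llama3",
    messages=[{"role": "user", "content": "What is Deep Learning?"}],
)
print(resp.choices[0].message.content)
```

The deleted `llm-ollama` image below was essentially a thin wrapper around this same capability.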

View File

@@ -11,10 +11,6 @@ services:
     build:
       dockerfile: comps/llms/src/text-generation/Dockerfile.intel_hpu
     image: ${REGISTRY:-opea}/llm-textgen-gaudi:${TAG:-latest}
-  llm-ollama:
-    build:
-      dockerfile: comps/llms/text-generation/ollama/langchain/Dockerfile
-    image: ${REGISTRY:-opea}/llm-ollama:${TAG:-latest}
   llm-docsum:
     build:
       dockerfile: comps/llms/src/doc-summarization/Dockerfile

View File

@@ -244,7 +244,7 @@ curl http://${your_ip}:8015/v1/finetune/list_checkpoints -X POST -H "Content-Typ
 ### 3.4 Leverage fine-tuned model
-After fine-tuning job is done, fine-tuned model can be chosen from listed checkpoints, then the fine-tuned model can be used in other microservices. For example, fine-tuned reranking model can be used in [reranks](../../rerankings/src/README.md) microservice by assign its path to the environment variable `RERANK_MODEL_ID`, fine-tuned embedding model can be used in [embeddings](../../embeddings/src/README.md) microservice by assign its path to the environment variable `model`, LLMs after instruction tuning can be used in [llms](../../llms/text-generation/README.md) microservice by assign its path to the environment variable `your_hf_llm_model`.
+After fine-tuning job is done, fine-tuned model can be chosen from listed checkpoints, then the fine-tuned model can be used in other microservices. For example, fine-tuned reranking model can be used in [reranks](../../rerankings/src/README.md) microservice by assign its path to the environment variable `RERANK_MODEL_ID`, fine-tuned embedding model can be used in [embeddings](../../embeddings/src/README.md) microservice by assign its path to the environment variable `model`, LLMs after instruction tuning can be used in [llms](../../llms/src/text-generation/README.md) microservice by assign its path to the environment variable `your_hf_llm_model`.
 ## 🚀4. Descriptions for Finetuning parameters

View File

@@ -57,18 +57,18 @@ curl --noproxy "*" http://localhost:11434/api/generate -d '{
 ## Build Docker Image
 ```bash
-cd GenAIComps/
-docker build --no-cache -t opea/llm-ollama:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/ollama/langchain/Dockerfile .
+cd ../../../../
+docker build -t opea/llm-textgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile .
 ```
 ## Run the Ollama Microservice
 ```bash
-docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/llm-ollama:latest
+docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LLM_ENDPOINT="http://localhost:11434" -e LLM_MODEL_ID="llama3" opea/llm-textgen:latest
 ```
 ## Consume the Ollama Microservice
 ```bash
-curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"model": "llama3", "query":"What is Deep Learning?","max_tokens":32,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' -H 'Content-Type: application/json'
+curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}' -H 'Content-Type: application/json'
 ```
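Since the refactored service speaks the OpenAI chat-completions format, the streaming behaviour that the old curl example exercised via `"stream":true` can be reproduced with the OpenAI client. A sketch under some assumptions: the `opea/llm-textgen` container from the README above is listening on port 9000, it honours the OpenAI `stream` parameter, `llama3` is the model it serves, and the API key is a placeholder it does not check:

```python
# Sketch: stream a completion from the refactored textgen microservice,
# mirroring the streaming request the old curl example made.
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:9000/v1", api_key="unused")

stream = client.chat.completions.create(
    model="llama3",  # assumed: the model configured via LLM_MODEL_ID
    messages=[{"role": "user", "content": "What is Deep Learning?"}],
    stream=True,
)
for chunk in stream:
    if not chunk.choices:
        continue
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()
```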

View File

@@ -1,26 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
FROM python:3.11-slim
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
curl \
libgl1-mesa-glx \
libjemalloc-dev
RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/
USER user
COPY comps /home/user/comps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r /home/user/comps/llms/text-generation/ollama/langchain/requirements.txt
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/llms/text-generation/ollama/langchain
ENTRYPOINT ["bash", "entrypoint.sh"]

View File

@@ -1,2 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

View File

@@ -1,8 +0,0 @@
#!/usr/bin/env bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
pip --no-cache-dir install -r requirements-runtime.txt
python llm.py

View File

@@ -1,60 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os

from fastapi.responses import StreamingResponse
from langchain_community.llms import Ollama

from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice

logger = CustomLogger("llm_ollama")
logflag = os.getenv("LOGFLAG", False)


@register_microservice(
    name="opea_service@llm_ollama",
    service_type=ServiceType.LLM,
    endpoint="/v1/chat/completions",
    host="0.0.0.0",
    port=9000,
)
async def llm_generate(input: LLMParamsDoc):
    if logflag:
        logger.info(input)
    ollama = Ollama(
        base_url=ollama_endpoint,
        model=input.model if input.model else model_name,
        num_predict=input.max_tokens,
        top_k=input.top_k,
        top_p=input.top_p,
        temperature=input.temperature,
        repeat_penalty=input.repetition_penalty,
    )
    # assuming you have Ollama installed and have llama3 model pulled with `ollama pull llama3`
    if input.stream:

        async def stream_generator():
            chat_response = ""
            async for text in ollama.astream(input.query):
                chat_response += text
                chunk_repr = repr(text.encode("utf-8"))
                if logflag:
                    logger.info(f"[llm - chat_stream] chunk:{chunk_repr}")
                yield f"data: {chunk_repr}\n\n"
            if logflag:
                logger.info(f"[llm - chat_stream] stream response: {chat_response}")
            yield "data: [DONE]\n\n"

        return StreamingResponse(stream_generator(), media_type="text/event-stream")
    else:
        response = await ollama.ainvoke(input.query)
        if logflag:
            logger.info(response)
        return GeneratedDoc(text=response, prompt=input.query)


if __name__ == "__main__":
    ollama_endpoint = os.getenv("OLLAMA_ENDPOINT", "http://localhost:11434")
    model_name = os.getenv("OLLAMA_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct")
    opea_microservices["opea_service@llm_ollama"].start()

View File

@@ -1,12 +0,0 @@
docarray[full]
fastapi
huggingface_hub
langchain
langchain-community
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
shortuuid
transformers
uvicorn

View File

@@ -0,0 +1,78 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -x

WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
ollama_endpoint_port=11435
llm_port=9000

function build_docker_images() {
    cd $WORKPATH
    docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -t opea/llm:comps -f comps/llms/src/text-generation/Dockerfile .
    if [ $? -ne 0 ]; then
        echo "opea/llm built fail"
        exit 1
    else
        echo "opea/llm built successful"
    fi
}

function start_service() {
    export llm_model=$1
    docker run -d --name="test-comps-llm-ollama-endpoint" -e https_proxy=$https_proxy -p $ollama_endpoint_port:11434 ollama/ollama
    export LLM_ENDPOINT="http://${ip_address}:${ollama_endpoint_port}"
    sleep 5s
    docker exec test-comps-llm-ollama-endpoint ollama pull $llm_model
    sleep 20s

    unset http_proxy
    docker run -d --name="test-comps-llm-ollama-server" -p $llm_port:9000 --ipc=host -e LOGFLAG=True -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LLM_ENDPOINT=$LLM_ENDPOINT -e LLM_MODEL_ID=$llm_model opea/llm:comps
    sleep 20s
}

function validate_microservice() {
    result=$(http_proxy="" curl http://${ip_address}:${llm_port}/v1/chat/completions \
        -X POST \
        -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}' \
        -H 'Content-Type: application/json')
    if [[ $result == *"content"* ]]; then
        echo "Result correct."
    else
        echo "Result wrong. Received was $result"
        docker logs test-comps-llm-ollama-endpoint >> ${LOG_PATH}/llm-ollama.log
        docker logs test-comps-llm-ollama-server >> ${LOG_PATH}/llm-server.log
        exit 1
    fi
}

function stop_docker() {
    cid=$(docker ps -aq --filter "name=test-comps-llm-ollama*")
    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
}

function main() {
    stop_docker
    build_docker_images
    pip install --no-cache-dir openai

    llm_models=(
        llama3.2:1b
    )
    for model in "${llm_models[@]}"; do
        start_service "${model}"
        validate_microservice
        stop_docker
    done

    echo y | docker system prune
}

main
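The UT above validates with curl even though it installs the `openai` package; a hypothetical Python counterpart of `validate_microservice`, assuming the `opea/llm:comps` container is reachable at `http://<ip_address>:9000` as in the script (the function name, default port, and placeholder API key here are illustrative, not part of the test):

```python
# Hypothetical Python equivalent of validate_microservice(), using the
# openai package that the test installs. Assumes the opea/llm:comps
# container started by start_service is reachable at http://<ip_address>:9000.
from openai import OpenAI


def validate_with_openai_client(ip_address: str, llm_port: int = 9000) -> bool:
    client = OpenAI(base_url=f"http://{ip_address}:{llm_port}/v1", api_key="unused")
    resp = client.chat.completions.create(
        model="llama3.2:1b",  # the model pulled in start_service
        messages=[{"role": "user", "content": "What is Deep Learning?"}],
    )
    content = resp.choices[0].message.content
    print(content)
    # loosely mirrors the curl check that the response carries a "content" field
    return bool(content)


if __name__ == "__main__":
    assert validate_with_openai_client("127.0.0.1")
```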