textgen ollama code refactor. (#1158)
Remove the Ollama folder, since the default OpenAI API can consume the Ollama service; modify the Ollama README and add a UT. #998 Signed-off-by: Ye, Xinyu <xinyu.ye@intel.com>
@@ -11,10 +11,6 @@ services:
    build:
      dockerfile: comps/llms/src/text-generation/Dockerfile.intel_hpu
    image: ${REGISTRY:-opea}/llm-textgen-gaudi:${TAG:-latest}
  llm-ollama:
    build:
      dockerfile: comps/llms/text-generation/ollama/langchain/Dockerfile
    image: ${REGISTRY:-opea}/llm-ollama:${TAG:-latest}
  llm-docsum:
    build:
      dockerfile: comps/llms/src/doc-summarization/Dockerfile
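Entries like these are consumed by `docker compose ... build`. A minimal sketch, assuming the hunk lives in a compose build file named `build.yaml` and that the first service key matches its image name `llm-textgen-gaudi` (neither name is visible in this hunk):

```bash
# Build only the textgen image defined above; REGISTRY/TAG default to opea/latest.
docker compose -f build.yaml build llm-textgen-gaudi
```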
@@ -244,7 +244,7 @@ curl http://${your_ip}:8015/v1/finetune/list_checkpoints -X POST -H "Content-Typ

### 3.4 Leverage fine-tuned model

After the fine-tuning job is done, the fine-tuned model can be chosen from the listed checkpoints and then used in other microservices. For example, a fine-tuned reranking model can be used in the [reranks](../../rerankings/src/README.md) microservice by assigning its path to the environment variable `RERANK_MODEL_ID`, a fine-tuned embedding model can be used in the [embeddings](../../embeddings/src/README.md) microservice by assigning its path to the environment variable `model`, and an LLM after instruction tuning can be used in the [llms](../../llms/text-generation/README.md) microservice by assigning its path to the environment variable `your_hf_llm_model`.
After the fine-tuning job is done, the fine-tuned model can be chosen from the listed checkpoints and then used in other microservices. For example, a fine-tuned reranking model can be used in the [reranks](../../rerankings/src/README.md) microservice by assigning its path to the environment variable `RERANK_MODEL_ID`, a fine-tuned embedding model can be used in the [embeddings](../../embeddings/src/README.md) microservice by assigning its path to the environment variable `model`, and an LLM after instruction tuning can be used in the [llms](../../llms/src/text-generation/README.md) microservice by assigning its path to the environment variable `your_hf_llm_model`.

## 🚀4. Descriptions for Finetuning parameters
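Tying section 3.4 above to concrete configuration, a minimal sketch of pointing the downstream microservices at a fine-tuned checkpoint through the environment variables the README names; the checkpoint path here is hypothetical:

```bash
# Hypothetical checkpoint path produced by the finetuning service.
FT_CKPT=/home/user/checkpoints/checkpoint-500

# Export the variables before launching each microservice.
export RERANK_MODEL_ID="$FT_CKPT"      # reranks microservice
export model="$FT_CKPT"                # embeddings microservice
export your_hf_llm_model="$FT_CKPT"    # llms (text-generation) microservice
```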
@@ -57,18 +57,18 @@ curl --noproxy "*" http://localhost:11434/api/generate -d '{

## Build Docker Image

```bash
cd GenAIComps/
docker build --no-cache -t opea/llm-ollama:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/ollama/langchain/Dockerfile .
cd ../../../../
docker build -t opea/llm-textgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile .
```

## Run the Ollama Microservice

```bash
docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/llm-ollama:latest
docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LLM_ENDPOINT="http://localhost:11434" -e LLM_MODEL_ID="llama3" opea/llm-textgen:latest
```

## Consume the Ollama Microservice

```bash
curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"model": "llama3", "query":"What is Deep Learning?","max_tokens":32,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' -H 'Content-Type: application/json'
curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}' -H 'Content-Type: application/json'
```
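The refactor leans on Ollama speaking the OpenAI chat-completions dialect directly, which is why the dedicated wrapper image can be dropped. A minimal sketch of that pattern, assuming a local Ollama server on port 11434 with `llama3` already pulled and an Ollama build that exposes the OpenAI-compatible `/v1/chat/completions` route:

```bash
# Query Ollama's OpenAI-compatible endpoint directly; pull the model first
# with `ollama pull llama3` if it is not present.
curl http://localhost:11434/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "llama3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 32}'
```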
@@ -1,26 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM python:3.11-slim

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    curl \
    libgl1-mesa-glx \
    libjemalloc-dev

RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip setuptools && \
    pip install --no-cache-dir -r /home/user/comps/llms/text-generation/ollama/langchain/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home/user

WORKDIR /home/user/comps/llms/text-generation/ollama/langchain

ENTRYPOINT ["bash", "entrypoint.sh"]
@@ -1,2 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
@@ -1,8 +0,0 @@
#!/usr/bin/env bash

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

pip --no-cache-dir install -r requirements-runtime.txt

python llm.py
@@ -1,60 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

from fastapi.responses import StreamingResponse
from langchain_community.llms import Ollama

from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice

logger = CustomLogger("llm_ollama")
logflag = os.getenv("LOGFLAG", False)


@register_microservice(
    name="opea_service@llm_ollama",
    service_type=ServiceType.LLM,
    endpoint="/v1/chat/completions",
    host="0.0.0.0",
    port=9000,
)
async def llm_generate(input: LLMParamsDoc):
    if logflag:
        logger.info(input)
    ollama = Ollama(
        base_url=ollama_endpoint,
        model=input.model if input.model else model_name,
        num_predict=input.max_tokens,
        top_k=input.top_k,
        top_p=input.top_p,
        temperature=input.temperature,
        repeat_penalty=input.repetition_penalty,
    )
    # assuming you have Ollama installed and have llama3 model pulled with `ollama pull llama3`
    if input.stream:

        async def stream_generator():
            chat_response = ""
            async for text in ollama.astream(input.query):
                chat_response += text
                chunk_repr = repr(text.encode("utf-8"))
                if logflag:
                    logger.info(f"[llm - chat_stream] chunk:{chunk_repr}")
                yield f"data: {chunk_repr}\n\n"
            if logflag:
                logger.info(f"[llm - chat_stream] stream response: {chat_response}")
            yield "data: [DONE]\n\n"

        return StreamingResponse(stream_generator(), media_type="text/event-stream")
    else:
        response = await ollama.ainvoke(input.query)
        if logflag:
            logger.info(response)
        return GeneratedDoc(text=response, prompt=input.query)


if __name__ == "__main__":
    ollama_endpoint = os.getenv("OLLAMA_ENDPOINT", "http://localhost:11434")
    model_name = os.getenv("OLLAMA_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct")
    opea_microservices["opea_service@llm_ollama"].start()
@@ -1 +0,0 @@
langserve
@@ -1,12 +0,0 @@
docarray[full]
fastapi
huggingface_hub
langchain
langchain-community
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
shortuuid
transformers
uvicorn
tests/llms/test_llms_text-generation_service_ollama.sh (new file, 78 lines)
@@ -0,0 +1,78 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -x

WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
ollama_endpoint_port=11435
llm_port=9000

function build_docker_images() {
    cd $WORKPATH
    docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -t opea/llm:comps -f comps/llms/src/text-generation/Dockerfile .
    if [ $? -ne 0 ]; then
        echo "opea/llm built fail"
        exit 1
    else
        echo "opea/llm built successful"
    fi
}

function start_service() {
    export llm_model=$1
    docker run -d --name="test-comps-llm-ollama-endpoint" -e https_proxy=$https_proxy -p $ollama_endpoint_port:11434 ollama/ollama
    export LLM_ENDPOINT="http://${ip_address}:${ollama_endpoint_port}"

    sleep 5s
    docker exec test-comps-llm-ollama-endpoint ollama pull $llm_model
    sleep 20s

    unset http_proxy
    docker run -d --name="test-comps-llm-ollama-server" -p $llm_port:9000 --ipc=host -e LOGFLAG=True -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LLM_ENDPOINT=$LLM_ENDPOINT -e LLM_MODEL_ID=$llm_model opea/llm:comps
    sleep 20s
}

function validate_microservice() {
    result=$(http_proxy="" curl http://${ip_address}:${llm_port}/v1/chat/completions \
        -X POST \
        -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}' \
        -H 'Content-Type: application/json')
    if [[ $result == *"content"* ]]; then
        echo "Result correct."
    else
        echo "Result wrong. Received was $result"
        docker logs test-comps-llm-ollama-endpoint >> ${LOG_PATH}/llm-ollama.log
        docker logs test-comps-llm-ollama-server >> ${LOG_PATH}/llm-server.log
        exit 1
    fi
}

function stop_docker() {
    cid=$(docker ps -aq --filter "name=test-comps-llm-ollama*")
    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
}

function main() {

    stop_docker
    build_docker_images

    pip install --no-cache-dir openai

    llm_models=(
        llama3.2:1b
    )
    for model in "${llm_models[@]}"; do
        start_service "${model}"
        validate_microservice
        stop_docker
    done

    echo y | docker system prune

}

main
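For a local run, a minimal sketch of invoking the new UT, assuming the working directory is the repository's `tests/` folder (so that `WORKPATH=$(dirname "$PWD")` resolves to the repo root) and Docker plus curl are available:

```bash
# Run the Ollama textgen UT from GenAIComps/tests so the docker build paths
# inside the script resolve against the repository root.
cd GenAIComps/tests
bash llms/test_llms_text-generation_service_ollama.sh
```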