textgen ollama code refactor. (#1158)
Remove the Ollama folder, since the default OpenAI API component can consume the Ollama service directly; update the Ollama README and add a unit test. #998
Signed-off-by: Ye, Xinyu <xinyu.ye@intel.com>
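The refactor relies on Ollama's OpenAI-compatible `/v1/chat/completions` endpoint, which is why the generic OpenAI-style textgen component can consume it without a dedicated wrapper. A minimal sketch of a direct call against that endpoint (local host/port and the `llama3` model are assumptions; the model must already be pulled with `ollama pull llama3`):

```bash
# Hedged sketch: assumes a local Ollama server on its default port 11434
# and that `ollama pull llama3` has already been run.
curl http://localhost:11434/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "llama3", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}'
```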
@@ -11,10 +11,6 @@ services:
     build:
       dockerfile: comps/llms/src/text-generation/Dockerfile.intel_hpu
     image: ${REGISTRY:-opea}/llm-textgen-gaudi:${TAG:-latest}
-  llm-ollama:
-    build:
-      dockerfile: comps/llms/text-generation/ollama/langchain/Dockerfile
-    image: ${REGISTRY:-opea}/llm-ollama:${TAG:-latest}
   llm-docsum:
     build:
       dockerfile: comps/llms/src/doc-summarization/Dockerfile

@@ -244,7 +244,7 @@ curl http://${your_ip}:8015/v1/finetune/list_checkpoints -X POST -H "Content-Typ
 
 ### 3.4 Leverage fine-tuned model
 
-After fine-tuning job is done, fine-tuned model can be chosen from listed checkpoints, then the fine-tuned model can be used in other microservices. For example, fine-tuned reranking model can be used in [reranks](../../rerankings/src/README.md) microservice by assign its path to the environment variable `RERANK_MODEL_ID`, fine-tuned embedding model can be used in [embeddings](../../embeddings/src/README.md) microservice by assign its path to the environment variable `model`, LLMs after instruction tuning can be used in [llms](../../llms/text-generation/README.md) microservice by assign its path to the environment variable `your_hf_llm_model`.
+After fine-tuning job is done, fine-tuned model can be chosen from listed checkpoints, then the fine-tuned model can be used in other microservices. For example, fine-tuned reranking model can be used in [reranks](../../rerankings/src/README.md) microservice by assign its path to the environment variable `RERANK_MODEL_ID`, fine-tuned embedding model can be used in [embeddings](../../embeddings/src/README.md) microservice by assign its path to the environment variable `model`, LLMs after instruction tuning can be used in [llms](../../llms/src/text-generation/README.md) microservice by assign its path to the environment variable `your_hf_llm_model`.
 
 ## 🚀4. Descriptions for Finetuning parameters
 

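The README paragraph above points each microservice at a fine-tuned checkpoint through an environment variable. A hedged sketch of that wiring, using hypothetical placeholder paths (substitute the checkpoint paths returned by the `list_checkpoints` call; the variable names mirror the README):

```bash
# Hypothetical checkpoint paths; replace with the real paths reported by list_checkpoints.
export RERANK_MODEL_ID=/path/to/finetuned/rerank/checkpoint    # reranking microservice
export model=/path/to/finetuned/embedding/checkpoint           # embedding microservice
export your_hf_llm_model=/path/to/instruction-tuned/llm        # text-generation microservice
```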
@@ -57,18 +57,18 @@ curl --noproxy "*" http://localhost:11434/api/generate -d '{
 ## Build Docker Image
 
 ```bash
-cd GenAIComps/
-docker build --no-cache -t opea/llm-ollama:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/ollama/langchain/Dockerfile .
+cd ../../../../
+docker build -t opea/llm-textgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile .
 ```
 
 ## Run the Ollama Microservice
 
 ```bash
-docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/llm-ollama:latest
+docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LLM_ENDPOINT="http://localhost:11434" -e LLM_MODEL_ID="llama3" opea/llm-textgen:latest
 ```
 
 ## Consume the Ollama Microservice
 
 ```bash
-curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"model": "llama3", "query":"What is Deep Learning?","max_tokens":32,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' -H 'Content-Type: application/json'
+curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}' -H 'Content-Type: application/json'
 ```

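Beyond the non-streaming example added in the README above, a streaming variant should look like the following, assuming the refactored service honors the OpenAI-style `stream` and `max_tokens` request fields (an assumption, not shown in this diff):

```bash
# Hedged sketch: assumes the llm-textgen service accepts OpenAI-style
# "stream" and "max_tokens" fields in the request body.
curl http://127.0.0.1:9000/v1/chat/completions \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 32, "stream": true}'
```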
comps/llms/text-generation/ollama/langchain/Dockerfile (deleted)
@@ -1,26 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-FROM python:3.11-slim
-
-RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
-    curl \
-    libgl1-mesa-glx \
-    libjemalloc-dev
-
-RUN useradd -m -s /bin/bash user && \
-    mkdir -p /home/user && \
-    chown -R user /home/user/
-
-USER user
-
-COPY comps /home/user/comps
-
-RUN pip install --no-cache-dir --upgrade pip setuptools && \
-    pip install --no-cache-dir -r /home/user/comps/llms/text-generation/ollama/langchain/requirements.txt
-
-ENV PYTHONPATH=$PYTHONPATH:/home/user
-
-WORKDIR /home/user/comps/llms/text-generation/ollama/langchain
-
-ENTRYPOINT ["bash", "entrypoint.sh"]

@@ -1,2 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0

comps/llms/text-generation/ollama/langchain/entrypoint.sh (deleted)
@@ -1,8 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-pip --no-cache-dir install -r requirements-runtime.txt
-
-python llm.py

comps/llms/text-generation/ollama/langchain/llm.py (deleted)
@@ -1,60 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-
-from fastapi.responses import StreamingResponse
-from langchain_community.llms import Ollama
-
-from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice
-
-logger = CustomLogger("llm_ollama")
-logflag = os.getenv("LOGFLAG", False)
-
-
-@register_microservice(
-    name="opea_service@llm_ollama",
-    service_type=ServiceType.LLM,
-    endpoint="/v1/chat/completions",
-    host="0.0.0.0",
-    port=9000,
-)
-async def llm_generate(input: LLMParamsDoc):
-    if logflag:
-        logger.info(input)
-    ollama = Ollama(
-        base_url=ollama_endpoint,
-        model=input.model if input.model else model_name,
-        num_predict=input.max_tokens,
-        top_k=input.top_k,
-        top_p=input.top_p,
-        temperature=input.temperature,
-        repeat_penalty=input.repetition_penalty,
-    )
-    # assuming you have Ollama installed and have llama3 model pulled with `ollama pull llama3`
-    if input.stream:
-
-        async def stream_generator():
-            chat_response = ""
-            async for text in ollama.astream(input.query):
-                chat_response += text
-                chunk_repr = repr(text.encode("utf-8"))
-                if logflag:
-                    logger.info(f"[llm - chat_stream] chunk:{chunk_repr}")
-                yield f"data: {chunk_repr}\n\n"
-            if logflag:
-                logger.info(f"[llm - chat_stream] stream response: {chat_response}")
-            yield "data: [DONE]\n\n"
-
-        return StreamingResponse(stream_generator(), media_type="text/event-stream")
-    else:
-        response = await ollama.ainvoke(input.query)
-        if logflag:
-            logger.info(response)
-        return GeneratedDoc(text=response, prompt=input.query)
-
-
-if __name__ == "__main__":
-    ollama_endpoint = os.getenv("OLLAMA_ENDPOINT", "http://localhost:11434")
-    model_name = os.getenv("OLLAMA_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct")
-    opea_microservices["opea_service@llm_ollama"].start()

comps/llms/text-generation/ollama/langchain/requirements-runtime.txt (deleted)
@@ -1 +0,0 @@
-langserve

comps/llms/text-generation/ollama/langchain/requirements.txt (deleted)
@@ -1,12 +0,0 @@
-docarray[full]
-fastapi
-huggingface_hub
-langchain
-langchain-community
-opentelemetry-api
-opentelemetry-exporter-otlp
-opentelemetry-sdk
-prometheus-fastapi-instrumentator
-shortuuid
-transformers
-uvicorn

tests/llms/test_llms_text-generation_service_ollama.sh (new file, 78 lines)
@@ -0,0 +1,78 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -x
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+ip_address=$(hostname -I | awk '{print $1}')
+ollama_endpoint_port=11435
+llm_port=9000
+
+function build_docker_images() {
+    cd $WORKPATH
+    docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -t opea/llm:comps -f comps/llms/src/text-generation/Dockerfile .
+    if [ $? -ne 0 ]; then
+        echo "opea/llm built fail"
+        exit 1
+    else
+        echo "opea/llm built successful"
+    fi
+}
+
+function start_service() {
+    export llm_model=$1
+    docker run -d --name="test-comps-llm-ollama-endpoint" -e https_proxy=$https_proxy -p $ollama_endpoint_port:11434 ollama/ollama
+    export LLM_ENDPOINT="http://${ip_address}:${ollama_endpoint_port}"
+
+    sleep 5s
+    docker exec test-comps-llm-ollama-endpoint ollama pull $llm_model
+    sleep 20s
+
+    unset http_proxy
+    docker run -d --name="test-comps-llm-ollama-server" -p $llm_port:9000 --ipc=host -e LOGFLAG=True -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LLM_ENDPOINT=$LLM_ENDPOINT -e LLM_MODEL_ID=$llm_model opea/llm:comps
+    sleep 20s
+}
+
+function validate_microservice() {
+    result=$(http_proxy="" curl http://${ip_address}:${llm_port}/v1/chat/completions \
+        -X POST \
+        -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}' \
+        -H 'Content-Type: application/json')
+    if [[ $result == *"content"* ]]; then
+        echo "Result correct."
+    else
+        echo "Result wrong. Received was $result"
+        docker logs test-comps-llm-ollama-endpoint >> ${LOG_PATH}/llm-ollama.log
+        docker logs test-comps-llm-ollama-server >> ${LOG_PATH}/llm-server.log
+        exit 1
+    fi
+}
+
+function stop_docker() {
+    cid=$(docker ps -aq --filter "name=test-comps-llm-ollama*")
+    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
+}
+
+function main() {
+
+    stop_docker
+    build_docker_images
+
+    pip install --no-cache-dir openai
+
+    llm_models=(
+        llama3.2:1b
+    )
+    for model in "${llm_models[@]}"; do
+        start_service "${model}"
+        validate_microservice
+        stop_docker
+    done
+
+    echo y | docker system prune
+
+}
+
+main
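To run the new test locally, a hedged sketch: the script derives `WORKPATH` from the parent of the current directory, so launching it from the repository's `tests/` directory with a working Docker daemon appears to be the intended invocation (an assumption, not stated in the diff):

```bash
# Hedged sketch: assumes a GenAIComps checkout and Docker available.
# WORKPATH=$(dirname "$PWD") resolves to the repo root when run from tests/.
cd GenAIComps/tests
bash llms/test_llms_text-generation_service_ollama.sh
```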