textgen ollama code refactor. (#1158)

Remove the Ollama folder, since the default OpenAI API is able to consume the Ollama service; modify the Ollama README and add a UT.
#998
Signed-off-by: Ye, Xinyu <xinyu.ye@intel.com>
XinyuYe-Intel
2025-01-17 16:49:31 +08:00
committed by GitHub
parent ff094b555c
commit 28f9c31129
11 changed files with 83 additions and 118 deletions
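The refactor relies on Ollama exposing an OpenAI-compatible endpoint, so the unified `llm-textgen` component (which already speaks the OpenAI API) can consume it directly instead of needing a dedicated LangChain-based wrapper. A minimal sketch of that idea, assuming an Ollama server on `localhost:11434` with `llama3` already pulled (`ollama pull llama3`); the API key is a placeholder that Ollama ignores:

```python
# Minimal sketch: consume a local Ollama server through its OpenAI-compatible API.
# Assumes Ollama is running on localhost:11434 and `ollama pull llama3` was done.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:11434/v1",  # Ollama's OpenAI-compatible endpoint
    api_key="unused",  # required by the client, ignored by Ollama
)

resp = client.chat.completions.create(
    model="llama3",
    messages=[{"role": "user", "content": "What is Deep Learning?"}],
)
print(resp.choices[0].message.content)
```

The deleted `llm-ollama` image below was essentially a thin wrapper around this same capability.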

View File

@@ -11,10 +11,6 @@ services:
     build:
       dockerfile: comps/llms/src/text-generation/Dockerfile.intel_hpu
     image: ${REGISTRY:-opea}/llm-textgen-gaudi:${TAG:-latest}
-  llm-ollama:
-    build:
-      dockerfile: comps/llms/text-generation/ollama/langchain/Dockerfile
-    image: ${REGISTRY:-opea}/llm-ollama:${TAG:-latest}
   llm-docsum:
     build:
       dockerfile: comps/llms/src/doc-summarization/Dockerfile

View File

@@ -244,7 +244,7 @@ curl http://${your_ip}:8015/v1/finetune/list_checkpoints -X POST -H "Content-Typ
 ### 3.4 Leverage fine-tuned model
-After fine-tuning job is done, fine-tuned model can be chosen from listed checkpoints, then the fine-tuned model can be used in other microservices. For example, fine-tuned reranking model can be used in [reranks](../../rerankings/src/README.md) microservice by assign its path to the environment variable `RERANK_MODEL_ID`, fine-tuned embedding model can be used in [embeddings](../../embeddings/src/README.md) microservice by assign its path to the environment variable `model`, LLMs after instruction tuning can be used in [llms](../../llms/text-generation/README.md) microservice by assign its path to the environment variable `your_hf_llm_model`.
+After fine-tuning job is done, fine-tuned model can be chosen from listed checkpoints, then the fine-tuned model can be used in other microservices. For example, fine-tuned reranking model can be used in [reranks](../../rerankings/src/README.md) microservice by assign its path to the environment variable `RERANK_MODEL_ID`, fine-tuned embedding model can be used in [embeddings](../../embeddings/src/README.md) microservice by assign its path to the environment variable `model`, LLMs after instruction tuning can be used in [llms](../../llms/src/text-generation/README.md) microservice by assign its path to the environment variable `your_hf_llm_model`.
 ## 🚀4. Descriptions for Finetuning parameters

View File

@@ -57,18 +57,18 @@ curl --noproxy "*" http://localhost:11434/api/generate -d '{
 ## Build Docker Image
 ```bash
-cd GenAIComps/
-docker build --no-cache -t opea/llm-ollama:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/ollama/langchain/Dockerfile .
+cd ../../../../
+docker build -t opea/llm-textgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile .
 ```
 ## Run the Ollama Microservice
 ```bash
-docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/llm-ollama:latest
+docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LLM_ENDPOINT="http://localhost:11434" -e LLM_MODEL_ID="llama3" opea/llm-textgen:latest
 ```
 ## Consume the Ollama Microservice
 ```bash
-curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"model": "llama3", "query":"What is Deep Learning?","max_tokens":32,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' -H 'Content-Type: application/json'
+curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}' -H 'Content-Type: application/json'
 ```
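Since the refactored service speaks the OpenAI chat-completions format, the streaming behaviour that the old curl example exercised via `"stream":true` can be reproduced with the OpenAI client. A sketch under some assumptions: the `opea/llm-textgen` container from the README above is listening on port 9000, it honours the OpenAI `stream` parameter, `llama3` is the model it serves, and the API key is a placeholder it does not check:

```python
# Sketch: stream a completion from the refactored textgen microservice,
# mirroring the streaming request the old curl example made.
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:9000/v1", api_key="unused")

stream = client.chat.completions.create(
    model="llama3",  # assumed: the model configured via LLM_MODEL_ID
    messages=[{"role": "user", "content": "What is Deep Learning?"}],
    stream=True,
)
for chunk in stream:
    if not chunk.choices:
        continue
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()
```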

View File

@@ -1,26 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
FROM python:3.11-slim
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
curl \
libgl1-mesa-glx \
libjemalloc-dev
RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/
USER user
COPY comps /home/user/comps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r /home/user/comps/llms/text-generation/ollama/langchain/requirements.txt
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/llms/text-generation/ollama/langchain
ENTRYPOINT ["bash", "entrypoint.sh"]

View File

@@ -1,2 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

View File

@@ -1,8 +0,0 @@
#!/usr/bin/env bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
pip --no-cache-dir install -r requirements-runtime.txt
python llm.py

View File

@@ -1,60 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os

from fastapi.responses import StreamingResponse
from langchain_community.llms import Ollama

from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice

logger = CustomLogger("llm_ollama")
logflag = os.getenv("LOGFLAG", False)


@register_microservice(
    name="opea_service@llm_ollama",
    service_type=ServiceType.LLM,
    endpoint="/v1/chat/completions",
    host="0.0.0.0",
    port=9000,
)
async def llm_generate(input: LLMParamsDoc):
    if logflag:
        logger.info(input)
    ollama = Ollama(
        base_url=ollama_endpoint,
        model=input.model if input.model else model_name,
        num_predict=input.max_tokens,
        top_k=input.top_k,
        top_p=input.top_p,
        temperature=input.temperature,
        repeat_penalty=input.repetition_penalty,
    )
    # assuming you have Ollama installed and have llama3 model pulled with `ollama pull llama3`
    if input.stream:

        async def stream_generator():
            chat_response = ""
            async for text in ollama.astream(input.query):
                chat_response += text
                chunk_repr = repr(text.encode("utf-8"))
                if logflag:
                    logger.info(f"[llm - chat_stream] chunk:{chunk_repr}")
                yield f"data: {chunk_repr}\n\n"
            if logflag:
                logger.info(f"[llm - chat_stream] stream response: {chat_response}")
            yield "data: [DONE]\n\n"

        return StreamingResponse(stream_generator(), media_type="text/event-stream")
    else:
        response = await ollama.ainvoke(input.query)
        if logflag:
            logger.info(response)
        return GeneratedDoc(text=response, prompt=input.query)


if __name__ == "__main__":
    ollama_endpoint = os.getenv("OLLAMA_ENDPOINT", "http://localhost:11434")
    model_name = os.getenv("OLLAMA_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct")
    opea_microservices["opea_service@llm_ollama"].start()

View File

@@ -1,12 +0,0 @@
docarray[full]
fastapi
huggingface_hub
langchain
langchain-community
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
shortuuid
transformers
uvicorn

View File

@@ -0,0 +1,78 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
set -x

WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
ollama_endpoint_port=11435
llm_port=9000

function build_docker_images() {
    cd $WORKPATH
    docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -t opea/llm:comps -f comps/llms/src/text-generation/Dockerfile .
    if [ $? -ne 0 ]; then
        echo "opea/llm built fail"
        exit 1
    else
        echo "opea/llm built successful"
    fi
}

function start_service() {
    export llm_model=$1
    docker run -d --name="test-comps-llm-ollama-endpoint" -e https_proxy=$https_proxy -p $ollama_endpoint_port:11434 ollama/ollama
    export LLM_ENDPOINT="http://${ip_address}:${ollama_endpoint_port}"
    sleep 5s
    docker exec test-comps-llm-ollama-endpoint ollama pull $llm_model
    sleep 20s

    unset http_proxy
    docker run -d --name="test-comps-llm-ollama-server" -p $llm_port:9000 --ipc=host -e LOGFLAG=True -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LLM_ENDPOINT=$LLM_ENDPOINT -e LLM_MODEL_ID=$llm_model opea/llm:comps
    sleep 20s
}

function validate_microservice() {
    result=$(http_proxy="" curl http://${ip_address}:${llm_port}/v1/chat/completions \
        -X POST \
        -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}' \
        -H 'Content-Type: application/json')
    if [[ $result == *"content"* ]]; then
        echo "Result correct."
    else
        echo "Result wrong. Received was $result"
        docker logs test-comps-llm-ollama-endpoint >> ${LOG_PATH}/llm-ollama.log
        docker logs test-comps-llm-ollama-server >> ${LOG_PATH}/llm-server.log
        exit 1
    fi
}

function stop_docker() {
    cid=$(docker ps -aq --filter "name=test-comps-llm-ollama*")
    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
}

function main() {
    stop_docker
    build_docker_images
    pip install --no-cache-dir openai

    llm_models=(
        llama3.2:1b
    )
    for model in "${llm_models[@]}"; do
        start_service "${model}"
        validate_microservice
        stop_docker
    done

    echo y | docker system prune
}

main
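The UT above validates with curl even though it installs the `openai` package; a hypothetical Python counterpart of `validate_microservice`, assuming the `opea/llm:comps` container is reachable at `http://<ip_address>:9000` as in the script (the function name, default port, and placeholder API key here are illustrative, not part of the test):

```python
# Hypothetical Python equivalent of validate_microservice(), using the
# openai package that the test installs. Assumes the opea/llm:comps
# container started by start_service is reachable at http://<ip_address>:9000.
from openai import OpenAI


def validate_with_openai_client(ip_address: str, llm_port: int = 9000) -> bool:
    client = OpenAI(base_url=f"http://{ip_address}:{llm_port}/v1", api_key="unused")
    resp = client.chat.completions.create(
        model="llama3.2:1b",  # the model pulled in start_service
        messages=[{"role": "user", "content": "What is Deep Learning?"}],
    )
    content = resp.choices[0].message.content
    print(content)
    # loosely mirrors the curl check that the response carries a "content" field
    return bool(content)


if __name__ == "__main__":
    assert validate_with_openai_client("127.0.0.1")
```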