textgen ollama code refactor. (#1158)
Remove the Ollama folder, since the default OpenAI API can consume the Ollama service; modify the Ollama README and add a UT. #998 Signed-off-by: Ye, Xinyu <xinyu.ye@intel.com>
@@ -11,10 +11,6 @@ services:
    build:
      dockerfile: comps/llms/src/text-generation/Dockerfile.intel_hpu
    image: ${REGISTRY:-opea}/llm-textgen-gaudi:${TAG:-latest}
  llm-ollama:
    build:
      dockerfile: comps/llms/text-generation/ollama/langchain/Dockerfile
    image: ${REGISTRY:-opea}/llm-ollama:${TAG:-latest}
  llm-docsum:
    build:
      dockerfile: comps/llms/src/doc-summarization/Dockerfile
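Entries like these are consumed by `docker compose ... build`. A minimal sketch, assuming the hunk lives in a compose build file named `build.yaml` and that the first service key matches its image name `llm-textgen-gaudi` (neither name is visible in this hunk):

```bash
# Build only the textgen image defined above; REGISTRY/TAG default to opea/latest.
docker compose -f build.yaml build llm-textgen-gaudi
```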
@@ -244,7 +244,7 @@ curl http://${your_ip}:8015/v1/finetune/list_checkpoints -X POST -H "Content-Typ

### 3.4 Leverage fine-tuned model

After the fine-tuning job is done, the fine-tuned model can be chosen from the listed checkpoints and then used in other microservices. For example, a fine-tuned reranking model can be used in the [reranks](../../rerankings/src/README.md) microservice by assigning its path to the environment variable `RERANK_MODEL_ID`, a fine-tuned embedding model can be used in the [embeddings](../../embeddings/src/README.md) microservice by assigning its path to the environment variable `model`, and an LLM after instruction tuning can be used in the [llms](../../llms/text-generation/README.md) microservice by assigning its path to the environment variable `your_hf_llm_model`.
After the fine-tuning job is done, the fine-tuned model can be chosen from the listed checkpoints and then used in other microservices. For example, a fine-tuned reranking model can be used in the [reranks](../../rerankings/src/README.md) microservice by assigning its path to the environment variable `RERANK_MODEL_ID`, a fine-tuned embedding model can be used in the [embeddings](../../embeddings/src/README.md) microservice by assigning its path to the environment variable `model`, and an LLM after instruction tuning can be used in the [llms](../../llms/src/text-generation/README.md) microservice by assigning its path to the environment variable `your_hf_llm_model`.

## 🚀4. Descriptions for Finetuning parameters
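Tying section 3.4 above to concrete configuration, a minimal sketch of pointing the downstream microservices at a fine-tuned checkpoint through the environment variables the README names; the checkpoint path here is hypothetical:

```bash
# Hypothetical checkpoint path produced by the finetuning service.
FT_CKPT=/home/user/checkpoints/checkpoint-500

# Export the variables before launching each microservice.
export RERANK_MODEL_ID="$FT_CKPT"      # reranks microservice
export model="$FT_CKPT"                # embeddings microservice
export your_hf_llm_model="$FT_CKPT"    # llms (text-generation) microservice
```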
@@ -57,18 +57,18 @@ curl --noproxy "*" http://localhost:11434/api/generate -d '{

## Build Docker Image

```bash
cd GenAIComps/
docker build --no-cache -t opea/llm-ollama:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/ollama/langchain/Dockerfile .
cd ../../../../
docker build -t opea/llm-textgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/text-generation/Dockerfile .
```

## Run the Ollama Microservice

```bash
docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/llm-ollama:latest
docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LLM_ENDPOINT="http://localhost:11434" -e LLM_MODEL_ID="llama3" opea/llm-textgen:latest
```

## Consume the Ollama Microservice

```bash
curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"model": "llama3", "query":"What is Deep Learning?","max_tokens":32,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' -H 'Content-Type: application/json'
curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}' -H 'Content-Type: application/json'
```
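The refactor leans on Ollama speaking the OpenAI chat-completions dialect directly, which is why the dedicated wrapper image can be dropped. A minimal sketch of that pattern, assuming a local Ollama server on port 11434 with `llama3` already pulled and an Ollama build that exposes the OpenAI-compatible `/v1/chat/completions` route:

```bash
# Query Ollama's OpenAI-compatible endpoint directly; pull the model first
# with `ollama pull llama3` if it is not present.
curl http://localhost:11434/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"model": "llama3", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens": 32}'
```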
@@ -1,26 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM python:3.11-slim

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    curl \
    libgl1-mesa-glx \
    libjemalloc-dev

RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip setuptools && \
    pip install --no-cache-dir -r /home/user/comps/llms/text-generation/ollama/langchain/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home/user

WORKDIR /home/user/comps/llms/text-generation/ollama/langchain

ENTRYPOINT ["bash", "entrypoint.sh"]
@@ -1,2 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
@@ -1,8 +0,0 @@
#!/usr/bin/env bash

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

pip --no-cache-dir install -r requirements-runtime.txt

python llm.py
@@ -1,60 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

from fastapi.responses import StreamingResponse
from langchain_community.llms import Ollama

from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice

logger = CustomLogger("llm_ollama")
logflag = os.getenv("LOGFLAG", False)


@register_microservice(
    name="opea_service@llm_ollama",
    service_type=ServiceType.LLM,
    endpoint="/v1/chat/completions",
    host="0.0.0.0",
    port=9000,
)
async def llm_generate(input: LLMParamsDoc):
    if logflag:
        logger.info(input)
    ollama = Ollama(
        base_url=ollama_endpoint,
        model=input.model if input.model else model_name,
        num_predict=input.max_tokens,
        top_k=input.top_k,
        top_p=input.top_p,
        temperature=input.temperature,
        repeat_penalty=input.repetition_penalty,
    )
    # assuming you have Ollama installed and have llama3 model pulled with `ollama pull llama3`
    if input.stream:

        async def stream_generator():
            chat_response = ""
            async for text in ollama.astream(input.query):
                chat_response += text
                chunk_repr = repr(text.encode("utf-8"))
                if logflag:
                    logger.info(f"[llm - chat_stream] chunk:{chunk_repr}")
                yield f"data: {chunk_repr}\n\n"
            if logflag:
                logger.info(f"[llm - chat_stream] stream response: {chat_response}")
            yield "data: [DONE]\n\n"

        return StreamingResponse(stream_generator(), media_type="text/event-stream")
    else:
        response = await ollama.ainvoke(input.query)
        if logflag:
            logger.info(response)
        return GeneratedDoc(text=response, prompt=input.query)


if __name__ == "__main__":
    ollama_endpoint = os.getenv("OLLAMA_ENDPOINT", "http://localhost:11434")
    model_name = os.getenv("OLLAMA_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct")
    opea_microservices["opea_service@llm_ollama"].start()
@@ -1 +0,0 @@
langserve
@@ -1,12 +0,0 @@
docarray[full]
fastapi
huggingface_hub
langchain
langchain-community
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
shortuuid
transformers
uvicorn
tests/llms/test_llms_text-generation_service_ollama.sh (new file, 78 lines)
@@ -0,0 +1,78 @@
#!/bin/bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

set -x

WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
ip_address=$(hostname -I | awk '{print $1}')
ollama_endpoint_port=11435
llm_port=9000

function build_docker_images() {
    cd $WORKPATH
    docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -t opea/llm:comps -f comps/llms/src/text-generation/Dockerfile .
    if [ $? -ne 0 ]; then
        echo "opea/llm built fail"
        exit 1
    else
        echo "opea/llm built successful"
    fi
}

function start_service() {
    export llm_model=$1
    docker run -d --name="test-comps-llm-ollama-endpoint" -e https_proxy=$https_proxy -p $ollama_endpoint_port:11434 ollama/ollama
    export LLM_ENDPOINT="http://${ip_address}:${ollama_endpoint_port}"

    sleep 5s
    docker exec test-comps-llm-ollama-endpoint ollama pull $llm_model
    sleep 20s

    unset http_proxy
    docker run -d --name="test-comps-llm-ollama-server" -p $llm_port:9000 --ipc=host -e LOGFLAG=True -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LLM_ENDPOINT=$LLM_ENDPOINT -e LLM_MODEL_ID=$llm_model opea/llm:comps
    sleep 20s
}

function validate_microservice() {
    result=$(http_proxy="" curl http://${ip_address}:${llm_port}/v1/chat/completions \
        -X POST \
        -d '{"messages": [{"role": "user", "content": "What is Deep Learning?"}]}' \
        -H 'Content-Type: application/json')
    if [[ $result == *"content"* ]]; then
        echo "Result correct."
    else
        echo "Result wrong. Received was $result"
        docker logs test-comps-llm-ollama-endpoint >> ${LOG_PATH}/llm-ollama.log
        docker logs test-comps-llm-ollama-server >> ${LOG_PATH}/llm-server.log
        exit 1
    fi
}

function stop_docker() {
    cid=$(docker ps -aq --filter "name=test-comps-llm-ollama*")
    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
}

function main() {

    stop_docker
    build_docker_images

    pip install --no-cache-dir openai

    llm_models=(
        llama3.2:1b
    )
    for model in "${llm_models[@]}"; do
        start_service "${model}"
        validate_microservice
        stop_docker
    done

    echo y | docker system prune

}

main
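For a local run, a minimal sketch of invoking the new UT, assuming the working directory is the repository's `tests/` folder (so that `WORKPATH=$(dirname "$PWD")` resolves to the repo root) and Docker plus curl are available:

```bash
# Run the Ollama textgen UT from GenAIComps/tests so the docker build paths
# inside the script resolve against the repository root.
cd GenAIComps/tests
bash llms/test_llms_text-generation_service_ollama.sh
```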