Refactor FaqGen (#1093)

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
XinyaoWa
2025-01-13 11:30:59 +08:00
committed by GitHub
parent 3f23bf582a
commit ea72c943bd
31 changed files with 962 additions and 551 deletions

View File

@@ -0,0 +1,50 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3.8"
services:
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.4.0-intel-cpu
container_name: tgi-server
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "./data:/data"
shm_size: 1g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: opea/llm-faqgen:latest
container_name: llm-faqgen-server
depends_on:
tgi-service:
condition: service_healthy
ports:
- ${FAQ_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
networks:
default:
driver: bridge
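A minimal launch sketch for this Xeon TGI stack, assuming the file is saved as `faq-generation_tgi.yaml` (the name used in the FaqGen README later in this commit) and run from its directory; the exported values are placeholders, not defaults:

```bash
# Placeholder values - substitute your own host IP, HF token and model id.
export host_ip=<your_host_ip>
export LLM_ENDPOINT_PORT=8008
export FAQ_PORT=9000
export HUGGINGFACEHUB_API_TOKEN=<your_hf_api_token>
export LLM_MODEL_ID=<your_hf_llm_model>
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
export FAQGen_COMPONENT_NAME="OPEAFAQGen_TGI"

docker compose -f faq-generation_tgi.yaml up -d
```

Because the `llm` service waits on the TGI health check (`depends_on: condition: service_healthy`, up to 100 retries), the first start can take a while if the model is still downloading.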

View File

@@ -0,0 +1,61 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3.8"
services:
tgi-service:
image: ghcr.io/huggingface/tgi-gaudi:2.3.1
container_name: tgi-gaudi-server
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "./data:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
ENABLE_HPU_GRAPH: true
LIMIT_HPU_GRAPH: true
USE_FLASH_ATTENTION: true
FLASH_ATTENTION_RECOMPUTE: true
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
runtime: habana
cap_add:
- SYS_NICE
ipc: host
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
llm:
image: opea/llm-faqgen:latest
container_name: llm-faqgen-server
depends_on:
tgi-service:
condition: service_healthy
ports:
- ${FAQ_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
networks:
default:
driver: bridge

View File

@@ -0,0 +1,53 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3.8"
services:
vllm-service:
image: opea/vllm:latest
container_name: vllm-server
ports:
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "./data:/data"
shm_size: 128g
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
LLM_MODEL_ID: ${LLM_MODEL_ID}
VLLM_TORCH_PROFILER_DIR: "/mnt"
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80
llm:
image: opea/llm-faqgen:latest
container_name: llm-faqgen-server
depends_on:
vllm-service:
condition: service_healthy
ports:
- ${FAQ_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
networks:
default:
driver: bridge
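The same environment variables apply to this vLLM variant; what changes is the component name and the optional warm-up skip (a sketch, assuming the file is saved as `faq-generation_vllm.yaml` as referenced in the README below, and that the `opea/vllm:latest` image has already been built):

```bash
export FAQGen_COMPONENT_NAME="OPEAFAQGen_vLLM"   # registry name of the vLLM integration
export VLLM_SKIP_WARMUP=false                    # optional; the compose file defaults to false
docker compose -f faq-generation_vllm.yaml up -d
```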

View File

@@ -8,37 +8,49 @@ services:
image: opea/vllm-gaudi:latest
container_name: vllm-gaudi-server
ports:
- "8008:80"
- ${LLM_ENDPOINT_PORT:-8008}:80
volumes:
- "./data:/data"
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HF_TOKEN}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
LLM_MODEL_ID: ${LLM_MODEL_ID}
VLLM_TORCH_PROFILER_DIR: "/mnt"
host_ip: ${host_ip}
LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT}
VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-false}
runtime: habana
cap_add:
- SYS_NICE
ipc: host
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80
healthcheck:
test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"]
interval: 10s
timeout: 10s
retries: 100
command: --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
llm:
image: opea/llm-faqgen-vllm:latest
image: opea/llm-faqgen:latest
container_name: llm-faqgen-server
depends_on:
- vllm-service
vllm-service:
condition: service_healthy
ports:
- "9000:9000"
- ${FAQ_PORT:-9000}:9000
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
vLLM_ENDPOINT: ${vLLM_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
LLM_ENDPOINT: ${LLM_ENDPOINT}
LLM_MODEL_ID: ${LLM_MODEL_ID}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
FAQGen_COMPONENT_NAME: ${FAQGen_COMPONENT_NAME}
LOGFLAG: ${LOGFLAG:-False}
restart: unless-stopped
networks:

View File

@@ -1,75 +0,0 @@
# TGI FAQGen LLM Microservice
This microservice interacts with the TGI LLM server to generate FAQs (frequently asked questions and answers) from input text. [Text Generation Inference](https://github.com/huggingface/text-generation-inference) (TGI) is a toolkit for deploying and serving Large Language Models (LLMs). TGI enables high-performance text generation for the most popular open-source LLMs, including Llama, Falcon, StarCoder, BLOOM, GPT-NeoX, and more.
## 🚀1. Start Microservice with Docker
If you start the LLM microservice with Docker Compose, the `docker_compose_llm.yaml` file will also start a TGI service automatically.
### 1.1 Setup Environment Variables
To start the TGI and LLM services, you need to set up the following environment variables first.
```bash
export HF_TOKEN=${your_hf_api_token}
export TGI_LLM_ENDPOINT="http://${your_ip}:8008"
export LLM_MODEL_ID=${your_hf_llm_model}
```
### 1.2 Build Docker Image
```bash
cd ../../../../../
docker build -t opea/llm-faqgen-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/faq-generation/tgi/langchain/Dockerfile .
```
To start a docker container, you have two options:
- A. Run Docker with CLI
- B. Run Docker with Docker Compose
You can choose one as needed.
### 1.3 Run Docker with CLI (Option A)
```bash
docker run -d -p 8008:80 -v ./data:/data --name tgi_service --shm-size 1g ghcr.io/huggingface/text-generation-inference:1.4 --model-id ${LLM_MODEL_ID}
```
```bash
docker run -d --name="llm-faqgen-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TGI_LLM_ENDPOINT=$TGI_LLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HF_TOKEN opea/llm-faqgen-tgi:latest
```
### 1.4 Run Docker with Docker Compose (Option B)
```bash
docker compose -f docker_compose_llm.yaml up -d
```
## 🚀3. Consume LLM Service
### 3.1 Check Service Status
```bash
curl http://${your_ip}:9000/v1/health_check\
-X GET \
-H 'Content-Type: application/json'
```
### 3.2 Consume FAQGen LLM Service
```bash
# Streaming Response
# Set stream to True. Default will be True.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
-H 'Content-Type: application/json'
# Non-Streaming Response
# Set stream to False.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "stream":false}' \
-H 'Content-Type: application/json'
```

View File

@@ -1,34 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3.8"
services:
tgi_service:
image: ghcr.io/huggingface/text-generation-inference:1.4
container_name: tgi-service
ports:
- "8008:80"
volumes:
- "./data:/data"
environment:
HF_TOKEN: ${HF_TOKEN}
shm_size: 1g
command: --model-id ${LLM_MODEL_ID}
llm:
image: opea/llm-faqgen-tgi:latest
container_name: llm-faqgen-server
ports:
- "9000:9000"
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
restart: unless-stopped
networks:
default:
driver: bridge

View File

@@ -1,8 +0,0 @@
#!/usr/bin/env bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
pip --no-cache-dir install -r requirements-runtime.txt
python llm.py

View File

@@ -1,100 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
from fastapi.responses import StreamingResponse
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.llms import HuggingFaceEndpoint
from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice
from comps.cores.mega.utils import get_access_token
logger = CustomLogger("llm_faqgen")
logflag = os.getenv("LOGFLAG", False)
# Environment variables
TOKEN_URL = os.getenv("TOKEN_URL")
CLIENTID = os.getenv("CLIENTID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
def post_process_text(text: str):
if text == " ":
return "data: @#$\n\n"
if text == "\n":
return "data: <br/>\n\n"
if text.isspace():
return None
new_text = text.replace(" ", "@#$")
return f"data: {new_text}\n\n"
@register_microservice(
name="opea_service@llm_faqgen",
service_type=ServiceType.LLM,
endpoint="/v1/faqgen",
host="0.0.0.0",
port=9000,
)
async def llm_generate(input: LLMParamsDoc):
if logflag:
logger.info(input)
access_token = (
get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None
)
server_kwargs = {}
if access_token:
server_kwargs["headers"] = {"Authorization": f"Bearer {access_token}"}
llm = HuggingFaceEndpoint(
endpoint_url=llm_endpoint,
max_new_tokens=input.max_tokens,
top_k=input.top_k,
top_p=input.top_p,
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
streaming=input.stream,
server_kwargs=server_kwargs,
)
templ = """Create a concise FAQs (frequently asked questions and answers) for following text:
TEXT: {text}
Do not use any prefix or suffix to the FAQ.
"""
PROMPT = PromptTemplate.from_template(templ)
llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)
texts = text_splitter.split_text(input.query)
# Create multiple documents
docs = [Document(page_content=t) for t in texts]
if input.stream:
async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
_serializer = WellKnownLCSerializer()
async for chunk in llm_chain.astream_log(docs):
data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8")
if logflag:
logger.info(data)
yield f"data: {data}\n\n"
yield "data: [DONE]\n\n"
return StreamingResponse(stream_generator(), media_type="text/event-stream")
else:
response = await llm_chain.ainvoke(docs)
response = response["output_text"]
if logflag:
logger.info(response)
return GeneratedDoc(text=response, prompt=input.query)
if __name__ == "__main__":
llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
# Split text
text_splitter = CharacterTextSplitter()
opea_microservices["opea_service@llm_faqgen"].start()

View File

@@ -1,25 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
FROM python:3.11-slim
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
libgl1-mesa-glx \
libjemalloc-dev
RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/
USER user
COPY comps /home/user/comps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r /home/user/comps/llms/faq-generation/vllm/langchain/requirements.txt
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/llms/faq-generation/vllm/langchain
ENTRYPOINT ["bash", "entrypoint.sh"]

View File

@@ -1,77 +0,0 @@
# vLLM FAQGen LLM Microservice
This microservice interacts with the vLLM server to generate FAQs (frequently asked questions and answers) from input text. [vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference and serving; it delivers state-of-the-art serving throughput with advanced features such as PagedAttention and continuous batching. Besides GPUs, vLLM already supports [Intel CPUs](https://www.intel.com/content/www/us/en/products/overview.html) and [Gaudi accelerators](https://habana.ai/products).
## 🚀1. Start Microservice with Docker
If you start the LLM microservice with Docker Compose, the `docker_compose_llm.yaml` file will also start a vLLM service automatically.
To set up or build the vLLM image, follow the instructions provided in [vLLM Gaudi](https://github.com/opea-project/GenAIComps/tree/main/comps/llms/text-generation/vllm/langchain#22-vllm-on-gaudi)
### 1.1 Setup Environment Variables
To start the vLLM and LLM services, you need to set up the following environment variables first.
```bash
export HF_TOKEN=${your_hf_api_token}
export vLLM_ENDPOINT="http://${your_ip}:8008"
export LLM_MODEL_ID=${your_hf_llm_model}
```
### 1.2 Build Docker Image
```bash
cd ../../../../../
docker build -t opea/llm-faqgen-vllm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/faq-generation/vllm/langchain/Dockerfile .
```
To start a docker container, you have two options:
- A. Run Docker with CLI
- B. Run Docker with Docker Compose
You can choose one as needed.
### 1.3 Run Docker with CLI (Option A)
```bash
docker run -d -p 8008:80 -v ./data:/data --name vllm-service --shm-size 1g opea/vllm-gaudi:latest --model-id ${LLM_MODEL_ID}
```
```bash
docker run -d --name="llm-faqgen-server" -p 9000:9000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e vLLM_ENDPOINT=$vLLM_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HF_TOKEN opea/llm-faqgen-vllm:latest
```
### 1.4 Run Docker with Docker Compose (Option B)
```bash
docker compose -f docker_compose_llm.yaml up -d
```
## 🚀3. Consume LLM Service
### 3.1 Check Service Status
```bash
curl http://${your_ip}:9000/v1/health_check\
-X GET \
-H 'Content-Type: application/json'
```
### 3.2 Consume FAQGen LLM Service
```bash
# Streaming Response
# Set stream to True. Default will be True.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
-H 'Content-Type: application/json'
# Non-Streaming Response
# Set stream to False.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "stream":false}' \
-H 'Content-Type: application/json'
```

View File

@@ -1,2 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

View File

@@ -1,102 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
from fastapi.responses import StreamingResponse
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.llms import VLLMOpenAI
from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, ServiceType, opea_microservices, register_microservice
from comps.cores.mega.utils import get_access_token
logger = CustomLogger("llm_faqgen")
logflag = os.getenv("LOGFLAG", False)
# Environment variables
TOKEN_URL = os.getenv("TOKEN_URL")
CLIENTID = os.getenv("CLIENTID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
def post_process_text(text: str):
if text == " ":
return "data: @#$\n\n"
if text == "\n":
return "data: <br/>\n\n"
if text.isspace():
return None
new_text = text.replace(" ", "@#$")
return f"data: {new_text}\n\n"
@register_microservice(
name="opea_service@llm_faqgen",
service_type=ServiceType.LLM,
endpoint="/v1/faqgen",
host="0.0.0.0",
port=9000,
)
async def llm_generate(input: LLMParamsDoc):
if logflag:
logger.info(input)
access_token = (
get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None
)
headers = {}
if access_token:
headers = {"Authorization": f"Bearer {access_token}"}
model = input.model if input.model else os.getenv("LLM_MODEL_ID")
llm = VLLMOpenAI(
openai_api_key="EMPTY",
openai_api_base=llm_endpoint + "/v1",
model_name=model,
default_headers=headers,
max_tokens=input.max_tokens,
top_p=input.top_p,
streaming=input.stream,
temperature=input.temperature,
)
templ = """Create a concise FAQs (frequently asked questions and answers) for following text:
TEXT: {text}
Do not use any prefix or suffix to the FAQ.
"""
PROMPT = PromptTemplate.from_template(templ)
llm_chain = load_summarize_chain(llm=llm, prompt=PROMPT)
texts = text_splitter.split_text(input.query)
# Create multiple documents
docs = [Document(page_content=t) for t in texts]
if input.stream:
async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
_serializer = WellKnownLCSerializer()
async for chunk in llm_chain.astream_log(docs):
data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8")
if logflag:
logger.info(data)
yield f"data: {data}\n\n"
yield "data: [DONE]\n\n"
return StreamingResponse(stream_generator(), media_type="text/event-stream")
else:
response = await llm_chain.ainvoke(docs)
response = response["output_text"]
if logflag:
logger.info(response)
return GeneratedDoc(text=response, prompt=input.query)
if __name__ == "__main__":
llm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8080")
# Split text
text_splitter = CharacterTextSplitter()
opea_microservices["opea_service@llm_faqgen"].start()

View File

@@ -1,15 +0,0 @@
docarray[full]
fastapi
huggingface_hub
langchain
langchain-huggingface
langchain-openai
langchain_community
langchainhub
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
shortuuid
transformers
uvicorn

View File

@@ -16,10 +16,10 @@ USER user
COPY comps /home/user/comps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r /home/user/comps/llms/faq-generation/tgi/langchain/requirements.txt
pip install --no-cache-dir -r /home/user/comps/llms/src/faq-generation/requirements.txt
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/llms/faq-generation/tgi/langchain
WORKDIR /home/user/comps/llms/src/faq-generation
ENTRYPOINT ["bash", "entrypoint.sh"]

View File

@@ -0,0 +1,110 @@
# FAQGen LLM Microservice
This microservice interacts with a TGI or vLLM LLM server to generate FAQs (frequently asked questions and answers) from input text. You can set the backend service to either [TGI](../../../third_parties/tgi) or [vLLM](../../../third_parties/vllm).
## 🚀1. Start Microservice with Docker
### 1.1 Setup Environment Variables
To start the FaqGen microservice, you need to set up the following environment variables first.
```bash
export host_ip=${your_host_ip}
export LLM_ENDPOINT_PORT=8008
export FAQ_PORT=9000
export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}"
export LLM_MODEL_ID=${your_hf_llm_model}
export FAQGen_COMPONENT_NAME="OPEAFAQGen_TGI" # or "OPEAFAQGen_vLLM" for a vLLM backend
```
### 1.2 Build Docker Image
Step 1: Prepare the backend LLM Docker image.
If you want to use the vLLM backend, refer to [vLLM](../../../third_parties/vllm/src) to build the vLLM Docker image first.
This step is not needed for TGI.
Step 2: Build the FaqGen Docker image.
```bash
cd ../../../../
docker build -t opea/llm-faqgen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/src/faq-generation/Dockerfile .
```
### 1.3 Run Docker
To start a docker container, you have two options:
- A. Run Docker with CLI
- B. Run Docker with Docker Compose
You can choose one as needed.
#### 1.3.1 Run Docker with CLI (Option A)
Step 1: Start the backend LLM service.
Refer to the [TGI](../../../third_parties/tgi/deployment/docker_compose/) or [vLLM](../../../third_parties/vllm/deployment/docker_compose/) guide to start a backend LLM service.
Step 2: Start the FaqGen microservice.
```bash
docker run -d \
--name="llm-faqgen-server" \
-p 9000:9000 \
--ipc=host \
-e http_proxy=$http_proxy \
-e https_proxy=$https_proxy \
-e LLM_MODEL_ID=$LLM_MODEL_ID \
-e LLM_ENDPOINT=$LLM_ENDPOINT \
-e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN \
-e FAQGen_COMPONENT_NAME=$FAQGen_COMPONENT_NAME \
opea/llm-faqgen:latest
```
#### 1.3.2 Run Docker with Docker Compose (Option B)
```bash
cd ../../deployment/docker_compose/
# Backend is TGI on xeon
docker compose -f faq-generation_tgi.yaml up -d
# Backend is TGI on gaudi
# docker compose -f faq-generation_tgi_on_intel_hpu.yaml up -d
# Backend is vLLM on xeon
# docker compose -f faq-generation_vllm.yaml up -d
# Backend is vLLM on gaudi
# docker compose -f faq-generation_vllm_on_intel_hpu.yaml up -d
```
## 🚀2. Consume LLM Service
### 2.1 Check Service Status
```bash
curl http://${host_ip}:${FAQ_PORT}/v1/health_check \
-X GET \
-H 'Content-Type: application/json'
```
### 2.2 Consume FAQGen LLM Service
```bash
# Streaming Response
# Set stream to True (the default).
curl http://${host_ip}:${FAQ_PORT}/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens": 128}' \
-H 'Content-Type: application/json'
# Non-Streaming Response
# Set stream to False.
curl http://${host_ip}:${FAQ_PORT}/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.","max_tokens": 128, "stream":false}' \
-H 'Content-Type: application/json'
```
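If a request fails, a quick way to inspect the microservice is through its container logs (a sketch; `llm-faqgen-server` is the container name set in the compose files above, and `LOGFLAG=True` enables request/response logging):

```bash
docker logs -f llm-faqgen-server                 # follow FaqGen logs
docker compose -f faq-generation_tgi.yaml down   # tear the stack down when finished
```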

View File

@@ -5,4 +5,4 @@
pip --no-cache-dir install -r requirements-runtime.txt
python llm.py
python opea_faqgen_microservice.py

View File

@@ -0,0 +1,110 @@
# Copyright (C) 2024 Prediction Guard, Inc.
# SPDX-License-Identifier: Apache-2.0
import os
import requests
from fastapi.responses import StreamingResponse
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, OpeaComponent, ServiceType
from comps.cores.mega.utils import ConfigError, get_access_token, load_model_configs
logger = CustomLogger("opea_faqgen")
logflag = os.getenv("LOGFLAG", False)
templ = """Create a concise FAQs (frequently asked questions and answers) for following text:
TEXT: {text}
Do not use any prefix or suffix to the FAQ.
"""
# Environment variables
MODEL_NAME = os.getenv("LLM_MODEL_ID")
MODEL_CONFIGS = os.getenv("MODEL_CONFIGS")
TOKEN_URL = os.getenv("TOKEN_URL")
CLIENTID = os.getenv("CLIENTID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
if os.getenv("LLM_ENDPOINT") is not None:
DEFAULT_ENDPOINT = os.getenv("LLM_ENDPOINT")
elif os.getenv("TGI_LLM_ENDPOINT") is not None:
DEFAULT_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT")
elif os.getenv("vLLM_ENDPOINT") is not None:
DEFAULT_ENDPOINT = os.getenv("vLLM_ENDPOINT")
else:
DEFAULT_ENDPOINT = "http://localhost:8080"
def get_llm_endpoint():
if not MODEL_CONFIGS:
return DEFAULT_ENDPOINT
else:
# Validate and Load the models config if MODEL_CONFIGS is not null
configs_map = {}
try:
configs_map = load_model_configs(MODEL_CONFIGS)
except ConfigError as e:
logger.error(f"Failed to load model configurations: {e}")
raise ConfigError(f"Failed to load model configurations: {e}")
try:
return configs_map.get(MODEL_NAME).get("endpoint")
except ConfigError as e:
logger.error(f"Input model {MODEL_NAME} not present in model_configs. Error {e}")
raise ConfigError(f"Input model {MODEL_NAME} not present in model_configs")
class OPEAFAQGen(OpeaComponent):
"""A specialized OPEA FAQGen component derived from OpeaComponent.
Attributes:
client (TGI/vLLM): An instance of the TGI/vLLM client for text generation.
"""
def __init__(self, name: str, description: str, config: dict = None):
super().__init__(name, ServiceType.LLM.name.lower(), description, config)
self.access_token = (
get_access_token(TOKEN_URL, CLIENTID, CLIENT_SECRET) if TOKEN_URL and CLIENTID and CLIENT_SECRET else None
)
self.text_splitter = CharacterTextSplitter()
self.llm_endpoint = get_llm_endpoint()
health_status = self.check_health()
if not health_status:
logger.error("OPEAFAQGen health check failed.")
async def generate(self, input: LLMParamsDoc, client):
"""Invokes the TGI/vLLM LLM service to generate FAQ output for the provided input.
Args:
input (LLMParamsDoc): The input text(s).
client: TGI/vLLM based client
"""
PROMPT = PromptTemplate.from_template(templ)
llm_chain = load_summarize_chain(llm=client, prompt=PROMPT)
texts = self.text_splitter.split_text(input.query)
# Create multiple documents
docs = [Document(page_content=t) for t in texts]
if input.stream:
async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
_serializer = WellKnownLCSerializer()
async for chunk in llm_chain.astream_log(docs):
data = _serializer.dumps({"ops": chunk.ops}).decode("utf-8")
if logflag:
logger.info(data)
yield f"data: {data}\n\n"
yield "data: [DONE]\n\n"
return StreamingResponse(stream_generator(), media_type="text/event-stream")
else:
response = await llm_chain.ainvoke(docs)
response = response["output_text"]
if logflag:
logger.info(response)
return GeneratedDoc(text=response, prompt=input.query)

View File

@@ -0,0 +1,73 @@
# Copyright (C) 2024 Prediction Guard, Inc.
# SPDX-License-Identifier: Apache-2.0
import os
import requests
from langchain_community.llms import HuggingFaceEndpoint
from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, OpeaComponent, OpeaComponentRegistry, ServiceType
from .common import *
logger = CustomLogger("opea_faqgen_tgi")
logflag = os.getenv("LOGFLAG", False)
@OpeaComponentRegistry.register("OPEAFAQGen_TGI")
class OPEAFAQGen_TGI(OPEAFAQGen):
"""A specialized OPEA FAQGen TGI component derived from OPEAFAQGen for interacting with TGI services based on Lanchain HuggingFaceEndpoint API.
Attributes:
client (TGI): An instance of the TGI client for text generation.
"""
def check_health(self) -> bool:
"""Checks the health of the TGI LLM service.
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""
try:
# response = requests.get(f"{self.llm_endpoint}/health")
# Will remove after TGI gaudi fix health bug
url = f"{self.llm_endpoint}/generate"
data = {"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 17}}
headers = {"Content-Type": "application/json"}
response = requests.post(url=url, json=data, headers=headers)
if response.status_code == 200:
return True
else:
return False
except Exception as e:
logger.error(e)
logger.error("Health check failed")
return False
async def invoke(self, input: LLMParamsDoc):
"""Invokes the TGI LLM service to generate FAQ output for the provided input.
Args:
input (LLMParamsDoc): The input text(s).
"""
server_kwargs = {}
if self.access_token:
server_kwargs["headers"] = {"Authorization": f"Bearer {self.access_token}"}
self.client = HuggingFaceEndpoint(
endpoint_url=self.llm_endpoint,
max_new_tokens=input.max_tokens,
top_k=input.top_k,
top_p=input.top_p,
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
streaming=input.stream,
server_kwargs=server_kwargs,
)
result = await self.generate(input, self.client)
return result
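The commented-out `/health` call above is replaced by a tiny `/generate` request as a workaround for the TGI Gaudi health-endpoint bug; the equivalent manual probe from the host would look roughly like this, assuming the endpoint variables from the compose files:

```bash
# Manual version of the in-code health probe: a one-token /generate request.
curl http://${host_ip}:${LLM_ENDPOINT_PORT}/generate \
  -X POST \
  -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17}}' \
  -H 'Content-Type: application/json'
```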

View File

@@ -0,0 +1,65 @@
# Copyright (C) 2024 Prediction Guard, Inc.
# SPDX-License-Identifier: Apache-2.0
import os
import requests
from langchain_community.llms import VLLMOpenAI
from comps import CustomLogger, GeneratedDoc, LLMParamsDoc, OpeaComponent, OpeaComponentRegistry, ServiceType
from .common import *
logger = CustomLogger("opea_faqgen_vllm")
logflag = os.getenv("LOGFLAG", False)
@OpeaComponentRegistry.register("OPEAFAQGen_vLLM")
class OPEAFAQGen_vLLM(OPEAFAQGen):
"""A specialized OPEA FAQGen vLLM component derived from OPEAFAQGen for interacting with vLLM services based on Lanchain VLLMOpenAI API.
Attributes:
client (vLLM): An instance of the vLLM client for text generation.
"""
def check_health(self) -> bool:
"""Checks the health of the vLLM LLM service.
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""
try:
response = requests.get(f"{self.llm_endpoint}/health")
if response.status_code == 200:
return True
else:
return False
except Exception as e:
logger.error(e)
logger.error("Health check failed")
return False
async def invoke(self, input: LLMParamsDoc):
"""Invokes the vLLM LLM service to generate FAQ output for the provided input.
Args:
input (LLMParamsDoc): The input text(s).
"""
headers = {}
if self.access_token:
headers = {"Authorization": f"Bearer {self.access_token}"}
self.client = VLLMOpenAI(
openai_api_key="EMPTY",
openai_api_base=self.llm_endpoint + "/v1",
model_name=MODEL_NAME,
default_headers=headers,
max_tokens=input.max_tokens,
top_p=input.top_p,
streaming=input.stream,
temperature=input.temperature,
)
result = await self.generate(input, self.client)
return result

View File

@@ -0,0 +1,58 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import time
from integrations.tgi import OPEAFAQGen_TGI
from integrations.vllm import OPEAFAQGen_vLLM
from comps import (
CustomLogger,
LLMParamsDoc,
OpeaComponentLoader,
ServiceType,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)
logger = CustomLogger("llm_faqgen")
logflag = os.getenv("LOGFLAG", False)
llm_component_name = os.getenv("FAQGen_COMPONENT_NAME", "OPEAFAQGen_TGI")
# Initialize OpeaComponentLoader
loader = OpeaComponentLoader(llm_component_name, description=f"OPEA LLM FAQGen Component: {llm_component_name}")
@register_microservice(
name="opea_service@llm_faqgen",
service_type=ServiceType.LLM,
endpoint="/v1/faqgen",
host="0.0.0.0",
port=9000,
)
@register_statistics(names=["opea_service@llm_faqgen"])
async def llm_generate(input: LLMParamsDoc):
start = time.time()
# Log the input if logging is enabled
if logflag:
logger.info(input)
try:
# Use the controller to invoke the active component
response = await loader.invoke(input)
# Record statistics
statistics_dict["opea_service@llm_faqgen"].append_latency(time.time() - start, None)
return response
except Exception as e:
logger.error(f"Error during FaqGen invocation: {e}")
raise
if __name__ == "__main__":
logger.info("OPEA FAQGen Microservice is starting...")
opea_microservices["opea_service@llm_faqgen"].start()
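For local debugging outside the container, a rough equivalent of what `entrypoint.sh` does, assuming a running TGI or vLLM backend, the GenAIComps repo root as the working directory, and the package dependencies already installed:

```bash
cd comps/llms/src/faq-generation
export LLM_ENDPOINT="http://localhost:8008"      # backend TGI or vLLM endpoint
export LLM_MODEL_ID=<your_hf_llm_model>          # placeholder model id
export FAQGen_COMPONENT_NAME="OPEAFAQGen_TGI"    # or "OPEAFAQGen_vLLM"
pip install -r requirements-runtime.txt
python opea_faqgen_microservice.py               # serves /v1/faqgen on port 9000
```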