Rename streaming to stream to align with OpenAI API (#1098)
Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
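The rename tracks the OpenAI Chat Completions API, which uses a boolean `stream` field in the request body. Purely as an illustration (not part of the diff below), a client call against one of the renamed endpoints could look like the following sketch; the host, port, and payload fields mirror the curl examples in the hunks further down, and everything else is an assumption.

```python
import requests  # assumes the requests package is installed

# Endpoint shape taken from the curl examples below; adjust host/port to your deployment.
url = "http://127.0.0.1:9000/v1/chat/completions"
payload = {
    "model": "llama3",
    "query": "What is Deep Learning?",
    "max_tokens": 32,
    "stream": True,  # renamed from "streaming" by this change
}

# requests' own stream=True keeps the HTTP connection open so the
# text/event-stream chunks can be read as they arrive.
with requests.post(url, json=payload, stream=True, timeout=600) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if line:
            print(line)
```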
@@ -56,7 +56,7 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, AgentCo
if logflag:
logger.info(input)

-input.streaming = args.streaming
+input.stream = args.stream
config = {"recursion_limit": args.recursion_limit}

if args.with_memory:
@@ -79,7 +79,7 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, AgentCo
input_query = input.messages[-1]["content"]

# 2. prepare the input for the agent
-if input.streaming:
+if input.stream:
logger.info("-----------STREAMING-------------")
return StreamingResponse(agent_inst.stream_generator(input_query, config), media_type="text/event-stream")
@@ -40,8 +40,8 @@ if os.environ.get("role_description") is not None:
if os.environ.get("tools") is not None:
env_config += ["--tools", os.environ["tools"]]

-if os.environ.get("streaming") is not None:
-env_config += ["--streaming", os.environ["streaming"]]
+if os.environ.get("stream") is not None:
+env_config += ["--stream", os.environ["stream"]]

if os.environ.get("max_new_tokens") is not None:
env_config += ["--max_new_tokens", os.environ["max_new_tokens"]]
@@ -38,7 +38,7 @@ The agent node takes user question, hints (optional) and history (when available
## Limitations

1. Agent is only allowed to issue "SELECT" commands to databases, i.e., agent can only query databases but cannot update databases.
-2. We currently does not support "streaming" agent outputs on the fly for `sql_agent_llama`.
+2. We currently does not support "stream" agent outputs on the fly for `sql_agent_llama`.
3. Users need to pass the SQL database URI to the agent with the `db_path` environment variable. We have only validated SQLite database connected in such way.

Please submit issues if you want new features to be added. We also welcome community contributions!
@@ -35,7 +35,7 @@ def setup_hf_tgi_client(args):
"temperature": args.temperature,
"repetition_penalty": args.repetition_penalty,
"return_full_text": args.return_full_text,
-"streaming": args.streaming,
+"stream": args.stream,
}

llm = HuggingFaceEndpoint(
@@ -53,7 +53,7 @@ def setup_chat_model(args):
"temperature": args.temperature,
"max_tokens": args.max_new_tokens,
"top_p": args.top_p,
-"streaming": args.streaming,
+"stream": args.stream,
}
if args.llm_engine == "vllm" or args.llm_engine == "tgi":
openai_endpoint = f"{args.llm_endpoint_url}/v1"
@@ -115,7 +115,7 @@ def adapt_custom_prompt(local_vars, custom_prompt):
def get_args():
parser = argparse.ArgumentParser()
# llm args
-parser.add_argument("--streaming", type=str, default="true")
+parser.add_argument("--stream", type=str, default="true")
parser.add_argument("--port", type=int, default=9090)
parser.add_argument("--agent_name", type=str, default="OPEA_Default_Agent")
parser.add_argument("--strategy", type=str, default="react_langchain")
@@ -153,10 +153,10 @@ def get_args():
for key, value in vars(env_args).items():
setattr(sys_args, key, value)

-if sys_args.streaming == "true":
-sys_args.streaming = True
+if sys_args.stream == "true":
+sys_args.stream = True
else:
-sys_args.streaming = False
+sys_args.stream = False

if sys_args.use_hints == "true":
print("SQL agent will use hints")
@@ -121,7 +121,7 @@ class ServiceOrchestrator(DAG):
downstreams.remove(downstream)
except re.error as e:
logger.error("Pattern invalid! Operation cancelled.")
-if len(downstreams) == 0 and llm_parameters.streaming:
+if len(downstreams) == 0 and llm_parameters.stream:
# turn the response to a StreamingResponse
# to make the response uniform to UI
def fake_stream(text):
@@ -153,7 +153,7 @@ class ServiceOrchestrator(DAG):
if node not in nodes_to_keep:
runtime_graph.delete_node_if_exists(node)

-if not llm_parameters.streaming:
+if not llm_parameters.stream:
self.metrics.pending_update(False)

return result_dict, runtime_graph
@@ -189,7 +189,7 @@ class ServiceOrchestrator(DAG):
# pre-process
inputs = self.align_inputs(inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs)

-if is_llm_vlm and llm_parameters.streaming:
+if is_llm_vlm and llm_parameters.stream:
# Still leave to sync requests.post for StreamingResponse
if LOGFLAG:
logger.info(inputs)
@@ -203,7 +203,7 @@ class ServiceOrchestrator(DAG):
)
downstream = runtime_graph.downstream(cur_node)
if downstream:
-assert len(downstream) == 1, "Not supported multiple streaming downstreams yet!"
+assert len(downstream) == 1, "Not supported multiple stream downstreams yet!"
cur_node = downstream[0]
hitted_ends = [".", "?", "!", "。", ",", "!"]
downstream_endpoint = self.services[downstream[0]].endpoint_path
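The orchestrator hunks above gate the event-stream path on `llm_parameters.stream`. A minimal sketch of that pattern, with illustrative names rather than the orchestrator's actual internals, assuming a FastAPI response:

```python
from fastapi.responses import StreamingResponse

def wrap_result(result_text: str, stream: bool):
    if stream:
        # Mirrors the fake_stream idea referenced above: emit the whole text
        # as a single server-sent-event chunk so the UI always receives an
        # event-stream shaped response.
        def fake_stream(text):
            yield f"data: {text}\n\n"

        return StreamingResponse(fake_stream(result_text), media_type="text/event-stream")
    # Non-stream requests get the accumulated text back as a plain payload.
    return {"text": result_text}
```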
@@ -150,7 +150,7 @@ class LVMSearchedMultimodalDoc(SearchedMultimodalDoc):
top_p: float = 0.95
typical_p: float = 0.95
temperature: float = 0.01
-streaming: bool = False
+stream: bool = False
repetition_penalty: float = 1.03
chat_template: Optional[str] = Field(
default=None,
@@ -184,7 +184,7 @@ class LLMParamsDoc(BaseDoc):
frequency_penalty: float = 0.0
presence_penalty: float = 0.0
repetition_penalty: float = 1.03
-streaming: bool = True
+stream: bool = True
language: str = "auto" # can be "en", "zh"

chat_template: Optional[str] = Field(
@@ -229,7 +229,7 @@ class LLMParams(BaseDoc):
frequency_penalty: float = 0.0
presence_penalty: float = 0.0
repetition_penalty: float = 1.03
-streaming: bool = True
+stream: bool = True
language: str = "auto" # can be "en", "zh"

chat_template: Optional[str] = Field(
@@ -292,7 +292,7 @@ class LVMDoc(BaseDoc):
typical_p: float = 0.95
temperature: float = 0.01
repetition_penalty: float = 1.03
-streaming: bool = False
+stream: bool = False


class LVMVideoDoc(BaseDoc):
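The schema hunks above rename the field on the request documents while keeping the existing defaults (`True` for the LLM parameter docs, `False` for the LVM docs). A small sketch of how such a document class carries the renamed field; the class name and field subset are chosen for illustration only:

```python
from docarray import BaseDoc

class LLMParamsSketch(BaseDoc):
    # Illustrative subset of the fields shown in the hunks above.
    frequency_penalty: float = 0.0
    presence_penalty: float = 0.0
    repetition_penalty: float = 1.03
    stream: bool = True  # renamed from "streaming"
    language: str = "auto"  # can be "en", "zh"

# A request that omits "stream" now falls back to the default of True.
params = LLMParamsSketch(repetition_penalty=1.03)
print(params.stream)  # True
```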
@@ -52,7 +52,7 @@ Applications' megaservice `ServiceOrchectrator` provides following metrics:

Latency ones are histogram metrics i.e. include count, total value and set of value buckets for each item.

-They are available only for _streaming_ requests using LLM. Pending count accounts for all requests.
+They are available only for _stream_ requests using LLM. Pending count accounts for all requests.

### Inferencing Metrics
@@ -83,6 +83,6 @@ Once intent detection microservice is started, user can use below command to inv
```bash
curl http://${your_ip}:9000/v1/chat/intent\
-X POST \
--d '{"query":"What is Deep Learning?","max_new_tokens":10,"top_k":1,"temperature":0.001,"streaming":false}' \
+-d '{"query":"What is Deep Learning?","max_new_tokens":10,"top_k":1,"temperature":0.001,"stream":false}' \
-H 'Content-Type: application/json'
```
@@ -26,7 +26,7 @@ async def llm_generate(input: LLMParamsDoc):
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
-streaming=input.streaming,
+streaming=input.stream,
timeout=600,
)
@@ -60,16 +60,16 @@ curl http://${your_ip}:9000/v1/health_check\

```bash
# Streaming Response
-# Set streaming to True. Default will be True.
+# Set stream to True. Default will be True.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
-H 'Content-Type: application/json'

# Non-Streaming Response
-# Set streaming to False.
+# Set stream to False.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
--d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "streaming":false}' \
+-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "stream":false}' \
-H 'Content-Type: application/json'
```
@@ -57,7 +57,7 @@ async def llm_generate(input: LLMParamsDoc):
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
-streaming=input.streaming,
+streaming=input.stream,
server_kwargs=server_kwargs,
)
templ = """Create a concise FAQs (frequently asked questions and answers) for following text:
@@ -71,7 +71,7 @@ async def llm_generate(input: LLMParamsDoc):
# Create multiple documents
docs = [Document(page_content=t) for t in texts]

-if input.streaming:
+if input.stream:

async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
@@ -62,16 +62,16 @@ curl http://${your_ip}:9000/v1/health_check\

```bash
# Streaming Response
-# Set streaming to True. Default will be True.
+# Set stream to True. Default will be True.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
-H 'Content-Type: application/json'

# Non-Streaming Response
-# Set streaming to False.
+# Set stream to False.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
--d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "streaming":false}' \
+-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "stream":false}' \
-H 'Content-Type: application/json'
```
@@ -58,7 +58,7 @@ async def llm_generate(input: LLMParamsDoc):
default_headers=headers,
max_tokens=input.max_tokens,
top_p=input.top_p,
-streaming=input.streaming,
+streaming=input.stream,
temperature=input.temperature,
)
@@ -73,7 +73,7 @@ async def llm_generate(input: LLMParamsDoc):
# Create multiple documents
docs = [Document(page_content=t) for t in texts]

-if input.streaming:
+if input.stream:

async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
@@ -76,9 +76,9 @@ curl http://${your_ip}:9000/v1/health_check\

### 3.2 Consume LLM Service

-You can set the following model parameters according to your actual needs, such as `max_tokens`, `streaming`.
+You can set the following model parameters according to your actual needs, such as `max_tokens`, `stream`.

-The `streaming` parameter determines the format of the data returned by the API. It will return text string with `streaming=false`, return text streaming flow with `streaming=true`.
+The `stream` parameter determines the format of the data returned by the API. It will return text string with `stream=false`, return text stream flow with `stream=true`.

```bash
# stream mode
@@ -133,7 +133,7 @@ class OPEALLM(OpeaComponent):
messages=prompt,
max_tokens=input.max_tokens,
top_p=input.top_p,
-stream=input.streaming,
+stream=input.stream,
frequency_penalty=input.frequency_penalty,
temperature=input.temperature,
)
@@ -105,22 +105,22 @@ If you want to deal with long context, can select suitable summary type, details
#### 3.2.1 Basic usage

```bash
-# Enable streaming to receive a streaming response. By default, this is set to True.
+# Enable stream to receive a stream response. By default, this is set to True.
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' \
-H 'Content-Type: application/json'

-# Disable streaming to receive a non-streaming response.
+# Disable stream to receive a non-stream response.
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
--d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "streaming":false}' \
+-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' \
-H 'Content-Type: application/json'

# Use Chinese mode
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
--d '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' \
+-d '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' \
-H 'Content-Type: application/json'
```
@@ -147,14 +147,14 @@ curl http://${your_ip}:9000/v1/chat/docsum \

**summary_type=map_reduce**

-Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `streaming=True` is not allowed here.
+Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `stream=True` is not allowed here.

In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`

```bash
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
--d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "streaming":false}' \
+-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \
-H 'Content-Type: application/json'
```
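To make the `chunk_size` default quoted above concrete, a quick worked example with hypothetical token budgets:

```python
# Hypothetical budgets, purely to illustrate the formula quoted above.
MAX_TOTAL_TOKENS = 4096
MAX_INPUT_TOKENS = 3072
max_tokens = 512  # the summary length requested via input.max_tokens

# Leave room for the requested summary plus a 50-token margin,
# but never exceed what the model accepts as input.
chunk_size = min(MAX_TOTAL_TOKENS - max_tokens - 50, MAX_INPUT_TOKENS)
print(chunk_size)  # min(3534, 3072) -> 3072
```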
@@ -171,9 +171,9 @@ async def llm_generate(input: DocSumLLMParams):
server_kwargs["headers"] = {"Authorization": f"Bearer {access_token}"}

## LLM
-if input.streaming and input.summary_type == "map_reduce":
-logger.info("Map Reduce mode don't support streaming=True, set to streaming=False")
-input.streaming = False
+if input.stream and input.summary_type == "map_reduce":
+logger.info("Map Reduce mode don't support stream=True, set to stream=False")
+input.stream = False
llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
llm = HuggingFaceEndpoint(
endpoint_url=llm_endpoint,
@@ -183,7 +183,7 @@ async def llm_generate(input: DocSumLLMParams):
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
-streaming=input.streaming,
+streaming=input.stream,
server_kwargs=server_kwargs,
)
@@ -209,7 +209,7 @@ async def llm_generate(input: DocSumLLMParams):
else:
raise NotImplementedError('Please specify the summary_type in "stuff", "truncate", "map_reduce", "refine"')

-if input.streaming:
+if input.stream:

async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
@@ -104,22 +104,22 @@ If you want to deal with long context, can select suitable summary type, details
#### 3.2.1 Basic usage

```bash
-# Enable streaming to receive a streaming response. By default, this is set to True.
+# Enable stream to receive a stream response. By default, this is set to True.
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' \
-H 'Content-Type: application/json'

-# Disable streaming to receive a non-streaming response.
+# Disable stream to receive a non-stream response.
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
--d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "streaming":false}' \
+-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' \
-H 'Content-Type: application/json'

# Use Chinese mode
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
--d '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' \
+-d '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' \
-H 'Content-Type: application/json'
```
@@ -146,14 +146,14 @@ curl http://${your_ip}:9000/v1/chat/docsum \

**summary_type=map_reduce**

-Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `streaming=True` is not allowed here.
+Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `stream=True` is not allowed here.

In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`

```bash
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
--d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "streaming":false}' \
+-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \
-H 'Content-Type: application/json'
```
@@ -172,9 +172,9 @@ async def llm_generate(input: DocSumLLMParams):
headers = {"Authorization": f"Bearer {access_token}"}

## LLM
-if input.streaming and input.summary_type == "map_reduce":
-logger.info("Map Reduce mode don't support streaming=True, set to streaming=False")
-input.streaming = False
+if input.stream and input.summary_type == "map_reduce":
+logger.info("Map Reduce mode don't support stream=True, set to stream=False")
+input.stream = False
llm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8080")
model = input.model if input.model else os.getenv("LLM_MODEL_ID")
llm = VLLMOpenAI(
@@ -184,7 +184,7 @@ async def llm_generate(input: DocSumLLMParams):
default_headers=headers,
max_tokens=input.max_tokens,
top_p=input.top_p,
-streaming=input.streaming,
+streaming=input.stream,
temperature=input.temperature,
presence_penalty=input.repetition_penalty,
)
@@ -211,7 +211,7 @@ async def llm_generate(input: DocSumLLMParams):
else:
raise NotImplementedError('Please specify the summary_type in "stuff", "truncate", "map_reduce", "refine"')

-if input.streaming:
+if input.stream:

async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
@@ -269,12 +269,12 @@ curl http://${host_ip}:8008/v1/chat/completions \

### 3.3 Consume LLM Service

-You can set the following model parameters according to your actual needs, such as `max_tokens`, `streaming`.
+You can set the following model parameters according to your actual needs, such as `max_tokens`, `stream`.

-The `streaming` parameter determines the format of the data returned by the API. It will return text string with `streaming=false`, return text streaming flow with `streaming=true`.
+The `stream` parameter determines the format of the data returned by the API. It will return text string with `stream=false`, return text stream flow with `stream=true`.

```bash
-# non-streaming mode
+# non-stream mode
curl http://${your_ip}:9000/v1/chat/completions \
-X POST \
-H 'Content-Type: application/json' \
@@ -286,11 +286,11 @@ curl http://${your_ip}:9000/v1/chat/completions \
"typical_p":0.95,
"temperature":0.01,
"repetition_penalty":1.03,
-"streaming":false
+"stream":false
}'


-# streaming mode
+# stream mode
curl http://${your_ip}:9000/v1/chat/completions \
-X POST \
-H 'Content-Type: application/json' \
@@ -302,7 +302,7 @@ curl http://${your_ip}:9000/v1/chat/completions \
"typical_p":0.95,
"temperature":0.01,
"repetition_penalty":1.03,
-"streaming":true
+"stream":true
}'

```
@@ -1,6 +1,6 @@
# LLM Native Microservice

-LLM Native microservice uses [optimum-habana](https://github.com/huggingface/optimum-habana) for model initialization and warm-up, focusing solely on large language models (LLMs). It operates without frameworks like TGI/VLLM, using PyTorch directly for inference, and supports only non-streaming formats. This streamlined approach optimizes performance on Habana hardware.
+LLM Native microservice uses [optimum-habana](https://github.com/huggingface/optimum-habana) for model initialization and warm-up, focusing solely on large language models (LLMs). It operates without frameworks like TGI/VLLM, using PyTorch directly for inference, and supports only non-stream formats. This streamlined approach optimizes performance on Habana hardware.

## 🚀1. Start Microservice
@@ -70,5 +70,5 @@ docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy
## Consume the Ollama Microservice

```bash
-curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"model": "llama3", "query":"What is Deep Learning?","max_tokens":32,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' -H 'Content-Type: application/json'
+curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"model": "llama3", "query":"What is Deep Learning?","max_tokens":32,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' -H 'Content-Type: application/json'
```
@@ -32,7 +32,7 @@ async def llm_generate(input: LLMParamsDoc):
repeat_penalty=input.repetition_penalty,
)
# assuming you have Ollama installed and have llama3 model pulled with `ollama pull llama3`
-if input.streaming:
+if input.stream:

async def stream_generator():
chat_response = ""
@@ -21,7 +21,7 @@ docker run -d -p 9000:9000 -e PREDICTIONGUARD_API_KEY=$PREDICTIONGUARD_API_KEY

See the [Prediction Guard docs](https://docs.predictionguard.com/) for available model options.

-### Without streaming
+### Without stream

```bash
curl -X POST http://localhost:9000/v1/chat/completions \
@@ -37,7 +37,7 @@ curl -X POST http://localhost:9000/v1/chat/completions \
}'
```

-### With streaming
+### With stream

```bash
curl -N -X POST http://localhost:9000/v1/chat/completions \
@@ -41,7 +41,7 @@ def llm_generate(input: LLMParamsDoc):
{"role": "user", "content": input.query},
]

-if input.streaming:
+if input.stream:

async def stream_generator():
chat_response = ""
@@ -94,7 +94,7 @@ bash ./launch_vllm_service.sh 8008 meta-llama/Meta-Llama-3-70b hpu 8

### 2.3 vLLM with OpenVINO (on Intel GPU and CPU)

-vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](https://github.com/vllm-project/vllm/blob/main/docs/source/models/supported_models.rst) and can perform optimal model serving on Intel GPU and all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs (starting from Intel® UHD Graphics generation). OpenVINO vLLM backend supports the following advanced vLLM features:
+vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](https://github.com/vllm-project/vllm/blob/main/docs/source/models/supported_models.md) and can perform optimal model serving on Intel GPU and all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs (starting from Intel® UHD Graphics generation). OpenVINO vLLM backend supports the following advanced vLLM features:

- Prefix caching (`--enable-prefix-caching`)
- Chunked prefill (`--enable-chunked-prefill`)
@@ -220,7 +220,7 @@ curl http://${your_ip}:9000/v1/health_check\
User can set the following model parameters according to needs:

- max_tokens: Total output token
-- streaming(true/false): return text response in streaming mode or non-streaming mode
+- stream(true/false): return text response in stream mode or non-stream mode

```bash
# stream mode
@@ -120,7 +120,7 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, Searche
if logflag:
logger.info(f"[ SearchedDoc ] final input: {new_input}")

-if new_input.streaming:
+if new_input.stream:

async def stream_generator():
chat_response = ""
@@ -172,7 +172,7 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, Searche
# use rag default template
prompt = ChatTemplate.generate_rag_prompt(input.query, input.documents, model_name)

-if input.streaming:
+if input.stream:

async def stream_generator():
chat_response = ""
@@ -70,7 +70,7 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc
f"[ LVMSearchedMultimodalDoc ] {prompt_template} not used, we only support 2 input variables ['question', 'context']"
)
max_new_tokens = request.max_new_tokens
-streaming = request.streaming
+stream = request.stream
repetition_penalty = request.repetition_penalty
temperature = request.temperature
top_k = request.top_k
@@ -82,7 +82,7 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc
img_b64_str = request.image
prompt = request.prompt
max_new_tokens = request.max_new_tokens
-streaming = request.streaming
+stream = request.stream
repetition_penalty = request.repetition_penalty
temperature = request.temperature
top_k = request.top_k
@@ -98,13 +98,13 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc
image = f"data:image/png;base64,{img_b64_str}"
image_prompt = f"\n{prompt}\nASSISTANT:"

-if streaming:
+if stream:

async def stream_generator():
chat_response = ""
text_generation = await lvm_client.text_generation(
prompt=image_prompt,
-stream=streaming,
+stream=stream,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
temperature=temperature,
@@ -48,7 +48,7 @@ def llm_generate(input: RAGASParams):
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
-streaming=input.streaming,
+streaming=input.stream,
timeout=600,
)