Rename streaming to stream to align with OpenAI API (#1098)

Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
Author: XinyaoWa
Date: 2025-01-06 13:25:47 +08:00
Committed by: GitHub
Parent: 6419ace56c
Commit: 679e6664d4
41 changed files with 90 additions and 90 deletions
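
The change is mechanical but API-facing: request payloads and parameter objects now use the OpenAI-style `stream` field instead of `streaming`. A minimal sketch of what that means for a caller (endpoint, port, and payload values are illustrative, not taken from this diff):

```python
import json

import requests  # assumed to be available in the caller's environment

# After this commit, the flag is spelled "stream", as in the OpenAI API.
payload = {
    "query": "What is Deep Learning?",
    "max_tokens": 32,
    "stream": False,  # previously: "streaming": False
}
# Hypothetical microservice endpoint; substitute the real host and port.
resp = requests.post(
    "http://localhost:9000/v1/chat/completions",
    headers={"Content-Type": "application/json"},
    data=json.dumps(payload),
    timeout=60,
)
print(resp.text)
```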

View File

@@ -56,7 +56,7 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, AgentCo
if logflag:
logger.info(input)
input.streaming = args.streaming
input.stream = args.stream
config = {"recursion_limit": args.recursion_limit}
if args.with_memory:
@@ -79,7 +79,7 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, AgentCo
input_query = input.messages[-1]["content"]
# 2. prepare the input for the agent
if input.streaming:
if input.stream:
logger.info("-----------STREAMING-------------")
return StreamingResponse(agent_inst.stream_generator(input_query, config), media_type="text/event-stream")

View File

@@ -40,8 +40,8 @@ if os.environ.get("role_description") is not None:
if os.environ.get("tools") is not None:
env_config += ["--tools", os.environ["tools"]]
if os.environ.get("streaming") is not None:
env_config += ["--streaming", os.environ["streaming"]]
if os.environ.get("stream") is not None:
env_config += ["--stream", os.environ["stream"]]
if os.environ.get("max_new_tokens") is not None:
env_config += ["--max_new_tokens", os.environ["max_new_tokens"]]

View File

@@ -38,7 +38,7 @@ The agent node takes user question, hints (optional) and history (when available
## Limitations
1. Agent is only allowed to issue "SELECT" commands to databases, i.e., agent can only query databases but cannot update databases.
2. We currently do not support "streaming" agent outputs on the fly for `sql_agent_llama`.
2. We currently do not support "stream" agent outputs on the fly for `sql_agent_llama`.
3. Users need to pass the SQL database URI to the agent with the `db_path` environment variable. We have only validated SQLite databases connected in this way.
Please submit issues if you want new features to be added. We also welcome community contributions!

View File

@@ -35,7 +35,7 @@ def setup_hf_tgi_client(args):
"temperature": args.temperature,
"repetition_penalty": args.repetition_penalty,
"return_full_text": args.return_full_text,
"streaming": args.streaming,
"stream": args.stream,
}
llm = HuggingFaceEndpoint(
@@ -53,7 +53,7 @@ def setup_chat_model(args):
"temperature": args.temperature,
"max_tokens": args.max_new_tokens,
"top_p": args.top_p,
"streaming": args.streaming,
"stream": args.stream,
}
if args.llm_engine == "vllm" or args.llm_engine == "tgi":
openai_endpoint = f"{args.llm_endpoint_url}/v1"
@@ -115,7 +115,7 @@ def adapt_custom_prompt(local_vars, custom_prompt):
def get_args():
parser = argparse.ArgumentParser()
# llm args
parser.add_argument("--streaming", type=str, default="true")
parser.add_argument("--stream", type=str, default="true")
parser.add_argument("--port", type=int, default=9090)
parser.add_argument("--agent_name", type=str, default="OPEA_Default_Agent")
parser.add_argument("--strategy", type=str, default="react_langchain")
@@ -153,10 +153,10 @@ def get_args():
for key, value in vars(env_args).items():
setattr(sys_args, key, value)
if sys_args.streaming == "true":
sys_args.streaming = True
if sys_args.stream == "true":
sys_args.stream = True
else:
sys_args.streaming = False
sys_args.stream = False
if sys_args.use_hints == "true":
print("SQL agent will use hints")

View File

@@ -121,7 +121,7 @@ class ServiceOrchestrator(DAG):
downstreams.remove(downstream)
except re.error as e:
logger.error("Pattern invalid! Operation cancelled.")
if len(downstreams) == 0 and llm_parameters.streaming:
if len(downstreams) == 0 and llm_parameters.stream:
# turn the response to a StreamingResponse
# to make the response uniform to UI
def fake_stream(text):
@@ -153,7 +153,7 @@ class ServiceOrchestrator(DAG):
if node not in nodes_to_keep:
runtime_graph.delete_node_if_exists(node)
if not llm_parameters.streaming:
if not llm_parameters.stream:
self.metrics.pending_update(False)
return result_dict, runtime_graph
@@ -189,7 +189,7 @@ class ServiceOrchestrator(DAG):
# pre-process
inputs = self.align_inputs(inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs)
if is_llm_vlm and llm_parameters.streaming:
if is_llm_vlm and llm_parameters.stream:
# Still leave to sync requests.post for StreamingResponse
if LOGFLAG:
logger.info(inputs)
@@ -203,7 +203,7 @@ class ServiceOrchestrator(DAG):
)
downstream = runtime_graph.downstream(cur_node)
if downstream:
assert len(downstream) == 1, "Not supported multiple streaming downstreams yet!"
assert len(downstream) == 1, "Not supported multiple stream downstreams yet!"
cur_node = downstream[0]
hitted_ends = [".", "?", "!", "", "", ""]
downstream_endpoint = self.services[downstream[0]].endpoint_path
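
For context on the orchestrator hunks above: when the final node's parameters request `stream`, a plain text result is wrapped as an event stream so streaming and non-streaming responses look the same to the UI. A hedged sketch of that wrapping (names and route are illustrative, not the orchestrator's actual code):

```python
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()

def fake_stream(text: str):
    # Emit the whole text as one server-sent-event chunk, then terminate.
    yield f"data: {text}\n\n"
    yield "data: [DONE]\n\n"

@app.get("/demo")
async def demo():
    return StreamingResponse(fake_stream("hello"), media_type="text/event-stream")
```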

View File

@@ -150,7 +150,7 @@ class LVMSearchedMultimodalDoc(SearchedMultimodalDoc):
top_p: float = 0.95
typical_p: float = 0.95
temperature: float = 0.01
streaming: bool = False
stream: bool = False
repetition_penalty: float = 1.03
chat_template: Optional[str] = Field(
default=None,
@@ -184,7 +184,7 @@ class LLMParamsDoc(BaseDoc):
frequency_penalty: float = 0.0
presence_penalty: float = 0.0
repetition_penalty: float = 1.03
streaming: bool = True
stream: bool = True
language: str = "auto" # can be "en", "zh"
chat_template: Optional[str] = Field(
@@ -229,7 +229,7 @@ class LLMParams(BaseDoc):
frequency_penalty: float = 0.0
presence_penalty: float = 0.0
repetition_penalty: float = 1.03
streaming: bool = True
stream: bool = True
language: str = "auto" # can be "en", "zh"
chat_template: Optional[str] = Field(
@@ -292,7 +292,7 @@ class LVMDoc(BaseDoc):
typical_p: float = 0.95
temperature: float = 0.01
repetition_penalty: float = 1.03
streaming: bool = False
stream: bool = False
class LVMVideoDoc(BaseDoc):
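
The parameter classes above are docarray `BaseDoc` models; a hedged sketch of the renamed field using plain pydantic for brevity (field defaults are copied from the diff context, the class name is illustrative):

```python
from typing import Optional

from pydantic import BaseModel, Field

class LLMParamsSketch(BaseModel):
    top_p: float = 0.95
    temperature: float = 0.01
    repetition_penalty: float = 1.03
    stream: bool = True  # renamed from "streaming" to match the OpenAI API
    chat_template: Optional[str] = Field(default=None)

print(LLMParamsSketch().stream)  # True by default, as in the hunk above
```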

View File

@@ -52,7 +52,7 @@ Applications' megaservice `ServiceOrchectrator` provides following metrics:
Latency ones are histogram metrics i.e. include count, total value and set of value buckets for each item.
They are available only for _streaming_ requests using LLM. Pending count accounts for all requests.
They are available only for _stream_ requests using LLM. Pending count accounts for all requests.
### Inferencing Metrics

View File

@@ -83,6 +83,6 @@ Once intent detection microservice is started, user can use below command to inv
```bash
curl http://${your_ip}:9000/v1/chat/intent\
-X POST \
-d '{"query":"What is Deep Learning?","max_new_tokens":10,"top_k":1,"temperature":0.001,"streaming":false}' \
-d '{"query":"What is Deep Learning?","max_new_tokens":10,"top_k":1,"temperature":0.001,"stream":false}' \
-H 'Content-Type: application/json'
```

View File

@@ -26,7 +26,7 @@ async def llm_generate(input: LLMParamsDoc):
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
streaming=input.streaming,
streaming=input.stream,
timeout=600,
)
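
Note the asymmetry in this hunk: only the OPEA request field was renamed, while LangChain's `HuggingFaceEndpoint` keeps its `streaming` keyword, so `input.stream` is mapped onto the old argument name. A hedged sketch of that call-site pattern (import path may differ across LangChain versions):

```python
from langchain_community.llms import HuggingFaceEndpoint  # path varies by LangChain version

def build_llm(input, llm_endpoint: str):
    # The renamed request field feeds LangChain's unchanged "streaming" kwarg.
    return HuggingFaceEndpoint(
        endpoint_url=llm_endpoint,
        temperature=input.temperature,
        repetition_penalty=input.repetition_penalty,
        streaming=input.stream,
        timeout=600,
    )
```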

View File

@@ -60,16 +60,16 @@ curl http://${your_ip}:9000/v1/health_check\
```bash
# Streaming Response
# Set streaming to True. Default will be True.
# Set stream to True. Default will be True.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
-H 'Content-Type: application/json'
# Non-Streaming Response
# Set streaming to False.
# Set stream to False.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "streaming":false}' \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "stream":false}' \
-H 'Content-Type: application/json'
```

View File

@@ -57,7 +57,7 @@ async def llm_generate(input: LLMParamsDoc):
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
streaming=input.streaming,
streaming=input.stream,
server_kwargs=server_kwargs,
)
templ = """Create a concise FAQs (frequently asked questions and answers) for following text:
@@ -71,7 +71,7 @@ async def llm_generate(input: LLMParamsDoc):
# Create multiple documents
docs = [Document(page_content=t) for t in texts]
if input.streaming:
if input.stream:
async def stream_generator():
from langserve.serialization import WellKnownLCSerializer

View File

@@ -62,16 +62,16 @@ curl http://${your_ip}:9000/v1/health_check\
```bash
# Streaming Response
# Set streaming to True. Default will be True.
# Set stream to True. Default will be True.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
-H 'Content-Type: application/json'
# Non-Streaming Response
# Set streaming to False.
# Set stream to False.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "streaming":false}' \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "stream":false}' \
-H 'Content-Type: application/json'
```

View File

@@ -58,7 +58,7 @@ async def llm_generate(input: LLMParamsDoc):
default_headers=headers,
max_tokens=input.max_tokens,
top_p=input.top_p,
streaming=input.streaming,
streaming=input.stream,
temperature=input.temperature,
)
@@ -73,7 +73,7 @@ async def llm_generate(input: LLMParamsDoc):
# Create multiple documents
docs = [Document(page_content=t) for t in texts]
if input.streaming:
if input.stream:
async def stream_generator():
from langserve.serialization import WellKnownLCSerializer

View File

@@ -76,9 +76,9 @@ curl http://${your_ip}:9000/v1/health_check\
### 3.2 Consume LLM Service
You can set the following model parameters according to your actual needs, such as `max_tokens`, `streaming`.
You can set the following model parameters according to your actual needs, such as `max_tokens`, `stream`.
The `streaming` parameter determines the format of the data returned by the API: it returns a plain text string with `streaming=false` and a text stream with `streaming=true`.
The `stream` parameter determines the format of the data returned by the API: it returns a plain text string with `stream=false` and a text stream with `stream=true`.
```bash
# stream mode

View File

@@ -133,7 +133,7 @@ class OPEALLM(OpeaComponent):
messages=prompt,
max_tokens=input.max_tokens,
top_p=input.top_p,
stream=input.streaming,
stream=input.stream,
frequency_penalty=input.frequency_penalty,
temperature=input.temperature,
)
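
In the OpenAI-client path above, the renamed field lines up one-to-one with the SDK's own `stream` argument. A hedged sketch of handling both modes (endpoint and model id are illustrative, assuming the openai>=1.x client):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8008/v1", api_key="EMPTY")  # illustrative endpoint

def generate(prompt: str, stream: bool) -> str:
    resp = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3-8B-Instruct",  # illustrative model id
        messages=[{"role": "user", "content": prompt}],
        max_tokens=32,
        stream=stream,
    )
    if stream:
        # stream=True yields chunks carrying incremental deltas.
        return "".join(chunk.choices[0].delta.content or "" for chunk in resp)
    return resp.choices[0].message.content
```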

View File

@@ -105,22 +105,22 @@ If you want to deal with long context, can select suitable summary type, details
#### 3.2.1 Basic usage
```bash
# Enable streaming to receive a streaming response. By default, this is set to True.
# Enable stream to receive a stream response. By default, this is set to True.
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' \
-H 'Content-Type: application/json'
# Disable streaming to receive a non-streaming response.
# Disable stream to receive a non-stream response.
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "streaming":false}' \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' \
-H 'Content-Type: application/json'
# Use Chinese mode
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"2024年9月26日北京——今日英特尔正式发布英特尔® 至强® 6性能核处理器代号Granite Rapids为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' \
-d '{"query":"2024年9月26日北京——今日英特尔正式发布英特尔® 至强® 6性能核处理器代号Granite Rapids为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' \
-H 'Content-Type: application/json'
```
@@ -147,14 +147,14 @@ curl http://${your_ip}:9000/v1/chat/docsum \
**summary_type=map_reduce**
Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `streaming=True` is not allowed here.
Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `stream=True` is not allowed here.
In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`
```bash
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "streaming":false}' \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \
-H 'Content-Type: application/json'
```

View File

@@ -171,9 +171,9 @@ async def llm_generate(input: DocSumLLMParams):
server_kwargs["headers"] = {"Authorization": f"Bearer {access_token}"}
## LLM
if input.streaming and input.summary_type == "map_reduce":
logger.info("Map Reduce mode don't support streaming=True, set to streaming=False")
input.streaming = False
if input.stream and input.summary_type == "map_reduce":
logger.info("Map Reduce mode don't support stream=True, set to stream=False")
input.stream = False
llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
llm = HuggingFaceEndpoint(
endpoint_url=llm_endpoint,
@@ -183,7 +183,7 @@ async def llm_generate(input: DocSumLLMParams):
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
streaming=input.streaming,
streaming=input.stream,
server_kwargs=server_kwargs,
)
@@ -209,7 +209,7 @@ async def llm_generate(input: DocSumLLMParams):
else:
raise NotImplementedError('Please specify the summary_type in "stuff", "truncate", "map_reduce", "refine"')
if input.streaming:
if input.stream:
async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
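
The DocSum hunks above also rename the guard that disables streaming for map-reduce summaries. A minimal sketch of that guard in isolation (the function name is illustrative):

```python
def normalize_stream_flag(input):
    # map_reduce consolidates per-chunk summaries, so there is no token stream
    # to relay; force the renamed flag off, mirroring the hunk above.
    if input.stream and input.summary_type == "map_reduce":
        input.stream = False
    return input
```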

View File

@@ -104,22 +104,22 @@ If you want to deal with long context, can select suitable summary type, details
#### 3.2.1 Basic usage
```bash
# Enable streaming to receive a streaming response. By default, this is set to True.
# Enable stream to receive a stream response. By default, this is set to True.
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' \
-H 'Content-Type: application/json'
# Disable streaming to receive a non-streaming response.
# Disable stream to receive a non-stream response.
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "streaming":false}' \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' \
-H 'Content-Type: application/json'
# Use Chinese mode
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"2024年9月26日北京——今日英特尔正式发布英特尔® 至强® 6性能核处理器代号Granite Rapids为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' \
-d '{"query":"2024年9月26日北京——今日英特尔正式发布英特尔® 至强® 6性能核处理器代号Granite Rapids为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' \
-H 'Content-Type: application/json'
```
@@ -146,14 +146,14 @@ curl http://${your_ip}:9000/v1/chat/docsum \
**summary_type=map_reduce**
Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `streaming=True` is not allowed here.
Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `stream=True` is not allowed here.
In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`
```bash
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "streaming":false}' \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \
-H 'Content-Type: application/json'
```

View File

@@ -172,9 +172,9 @@ async def llm_generate(input: DocSumLLMParams):
headers = {"Authorization": f"Bearer {access_token}"}
## LLM
if input.streaming and input.summary_type == "map_reduce":
logger.info("Map Reduce mode don't support streaming=True, set to streaming=False")
input.streaming = False
if input.stream and input.summary_type == "map_reduce":
logger.info("Map Reduce mode don't support stream=True, set to stream=False")
input.stream = False
llm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8080")
model = input.model if input.model else os.getenv("LLM_MODEL_ID")
llm = VLLMOpenAI(
@@ -184,7 +184,7 @@ async def llm_generate(input: DocSumLLMParams):
default_headers=headers,
max_tokens=input.max_tokens,
top_p=input.top_p,
streaming=input.streaming,
streaming=input.stream,
temperature=input.temperature,
presence_penalty=input.repetition_penalty,
)
@@ -211,7 +211,7 @@ async def llm_generate(input: DocSumLLMParams):
else:
raise NotImplementedError('Please specify the summary_type in "stuff", "truncate", "map_reduce", "refine"')
if input.streaming:
if input.stream:
async def stream_generator():
from langserve.serialization import WellKnownLCSerializer

View File

@@ -269,12 +269,12 @@ curl http://${host_ip}:8008/v1/chat/completions \
### 3.3 Consume LLM Service
You can set the following model parameters according to your actual needs, such as `max_tokens`, `streaming`.
You can set the following model parameters according to your actual needs, such as `max_tokens`, `stream`.
The `streaming` parameter determines the format of the data returned by the API: it returns a plain text string with `streaming=false` and a text stream with `streaming=true`.
The `stream` parameter determines the format of the data returned by the API: it returns a plain text string with `stream=false` and a text stream with `stream=true`.
```bash
# non-streaming mode
# non-stream mode
curl http://${your_ip}:9000/v1/chat/completions \
-X POST \
-H 'Content-Type: application/json' \
@@ -286,11 +286,11 @@ curl http://${your_ip}:9000/v1/chat/completions \
"typical_p":0.95,
"temperature":0.01,
"repetition_penalty":1.03,
"streaming":false
"stream":false
}'
# streaming mode
# stream mode
curl http://${your_ip}:9000/v1/chat/completions \
-X POST \
-H 'Content-Type: application/json' \
@@ -302,7 +302,7 @@ curl http://${your_ip}:9000/v1/chat/completions \
"typical_p":0.95,
"temperature":0.01,
"repetition_penalty":1.03,
"streaming":true
"stream":true
}'
```

View File

@@ -1,6 +1,6 @@
# LLM Native Microservice
LLM Native microservice uses [optimum-habana](https://github.com/huggingface/optimum-habana) for model initialization and warm-up, focusing solely on large language models (LLMs). It operates without frameworks like TGI/VLLM, using PyTorch directly for inference, and supports only non-streaming formats. This streamlined approach optimizes performance on Habana hardware.
LLM Native microservice uses [optimum-habana](https://github.com/huggingface/optimum-habana) for model initialization and warm-up, focusing solely on large language models (LLMs). It operates without frameworks like TGI/VLLM, using PyTorch directly for inference, and supports only non-stream formats. This streamlined approach optimizes performance on Habana hardware.
## 🚀1. Start Microservice

View File

@@ -70,5 +70,5 @@ docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy
## Consume the Ollama Microservice
```bash
curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"model": "llama3", "query":"What is Deep Learning?","max_tokens":32,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' -H 'Content-Type: application/json'
curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"model": "llama3", "query":"What is Deep Learning?","max_tokens":32,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' -H 'Content-Type: application/json'
```

View File

@@ -32,7 +32,7 @@ async def llm_generate(input: LLMParamsDoc):
repeat_penalty=input.repetition_penalty,
)
# assuming you have Ollama installed and have llama3 model pulled with `ollama pull llama3`
if input.streaming:
if input.stream:
async def stream_generator():
chat_response = ""
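
The hunk above is truncated right after the renamed `if input.stream:` branch. A hedged sketch of what such a streaming branch typically looks like, an async generator relaying chunks as server-sent events (the chunk shape is illustrative, not the exact Ollama client payload):

```python
from fastapi.responses import StreamingResponse

async def stream_generator(chunks):
    chat_response = ""
    async for chunk in chunks:
        text = chunk.get("message", {}).get("content", "")
        chat_response += text
        yield f"data: {text}\n\n"
    yield "data: [DONE]\n\n"

# Usage (illustrative):
# return StreamingResponse(stream_generator(chunks), media_type="text/event-stream")
```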

View File

@@ -21,7 +21,7 @@ docker run -d -p 9000:9000 -e PREDICTIONGUARD_API_KEY=$PREDICTIONGUARD_API_KEY
See the [Prediction Guard docs](https://docs.predictionguard.com/) for available model options.
### Without streaming
### Without stream
```bash
curl -X POST http://localhost:9000/v1/chat/completions \
@@ -37,7 +37,7 @@ curl -X POST http://localhost:9000/v1/chat/completions \
}'
```
### With streaming
### With stream
```bash
curl -N -X POST http://localhost:9000/v1/chat/completions \

View File

@@ -41,7 +41,7 @@ def llm_generate(input: LLMParamsDoc):
{"role": "user", "content": input.query},
]
if input.streaming:
if input.stream:
async def stream_generator():
chat_response = ""

View File

@@ -94,7 +94,7 @@ bash ./launch_vllm_service.sh 8008 meta-llama/Meta-Llama-3-70b hpu 8
### 2.3 vLLM with OpenVINO (on Intel GPU and CPU)
vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](https://github.com/vllm-project/vllm/blob/main/docs/source/models/supported_models.rst) and can perform optimal model serving on Intel GPU and all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs (starting from Intel® UHD Graphics generation). OpenVINO vLLM backend supports the following advanced vLLM features:
vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](https://github.com/vllm-project/vllm/blob/main/docs/source/models/supported_models.md) and can perform optimal model serving on Intel GPU and all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs (starting from Intel® UHD Graphics generation). OpenVINO vLLM backend supports the following advanced vLLM features:
- Prefix caching (`--enable-prefix-caching`)
- Chunked prefill (`--enable-chunked-prefill`)
@@ -220,7 +220,7 @@ curl http://${your_ip}:9000/v1/health_check\
User can set the following model parameters according to needs:
- max_tokens: Total output token
- streaming(true/false): return text response in streaming mode or non-streaming mode
- stream(true/false): return text response in stream mode or non-stream mode
```bash
# stream mode

View File

@@ -120,7 +120,7 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, Searche
if logflag:
logger.info(f"[ SearchedDoc ] final input: {new_input}")
if new_input.streaming:
if new_input.stream:
async def stream_generator():
chat_response = ""
@@ -172,7 +172,7 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, Searche
# use rag default template
prompt = ChatTemplate.generate_rag_prompt(input.query, input.documents, model_name)
if input.streaming:
if input.stream:
async def stream_generator():
chat_response = ""

View File

@@ -70,7 +70,7 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc
f"[ LVMSearchedMultimodalDoc ] {prompt_template} not used, we only support 2 input variables ['question', 'context']"
)
max_new_tokens = request.max_new_tokens
streaming = request.streaming
stream = request.stream
repetition_penalty = request.repetition_penalty
temperature = request.temperature
top_k = request.top_k
@@ -82,7 +82,7 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc
img_b64_str = request.image
prompt = request.prompt
max_new_tokens = request.max_new_tokens
streaming = request.streaming
stream = request.stream
repetition_penalty = request.repetition_penalty
temperature = request.temperature
top_k = request.top_k
@@ -98,13 +98,13 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc
image = f"data:image/png;base64,{img_b64_str}"
image_prompt = f"![]({image})\n{prompt}\nASSISTANT:"
if streaming:
if stream:
async def stream_generator():
chat_response = ""
text_generation = await lvm_client.text_generation(
prompt=image_prompt,
stream=streaming,
stream=stream,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
temperature=temperature,
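
In the LVM hunk above, the renamed request field is passed straight into huggingface_hub's `text_generation`, whose own argument has always been called `stream`. A hedged sketch of that client call (the endpoint is illustrative):

```python
from huggingface_hub import AsyncInferenceClient

async def generate(image_prompt: str, stream: bool, max_new_tokens: int = 512):
    lvm_client = AsyncInferenceClient("http://localhost:8399")  # illustrative TGI/LLaVA endpoint
    result = await lvm_client.text_generation(
        prompt=image_prompt,
        stream=stream,
        max_new_tokens=max_new_tokens,
    )
    if stream:
        # With stream=True the awaited call returns an async iterator of text chunks.
        return [chunk async for chunk in result]
    return result
```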

View File

@@ -48,7 +48,7 @@ def llm_generate(input: RAGASParams):
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
streaming=input.streaming,
streaming=input.stream,
timeout=600,
)