Rename streaming to stream to align with OpenAI API (#1098)
Signed-off-by: Xinyao Wang <xinyao.wang@intel.com>
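The rename tracks the OpenAI Chat Completions API, which uses a boolean `stream` field in the request body. Purely as an illustration (not part of the diff below), a client call against one of the renamed endpoints could look like the following sketch; the host, port, and payload fields mirror the curl examples in the hunks further down, and everything else is an assumption.

```python
import requests  # assumes the requests package is installed

# Endpoint shape taken from the curl examples below; adjust host/port to your deployment.
url = "http://127.0.0.1:9000/v1/chat/completions"
payload = {
    "model": "llama3",
    "query": "What is Deep Learning?",
    "max_tokens": 32,
    "stream": True,  # renamed from "streaming" by this change
}

# requests' own stream=True keeps the HTTP connection open so the
# text/event-stream chunks can be read as they arrive.
with requests.post(url, json=payload, stream=True, timeout=600) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if line:
            print(line)
```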
@@ -56,7 +56,7 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, AgentCo
if logflag:
logger.info(input)

-input.streaming = args.streaming
+input.stream = args.stream
config = {"recursion_limit": args.recursion_limit}

if args.with_memory:
@@ -79,7 +79,7 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, AgentCo
input_query = input.messages[-1]["content"]

# 2. prepare the input for the agent
-if input.streaming:
+if input.stream:
logger.info("-----------STREAMING-------------")
return StreamingResponse(agent_inst.stream_generator(input_query, config), media_type="text/event-stream")
@@ -40,8 +40,8 @@ if os.environ.get("role_description") is not None:
if os.environ.get("tools") is not None:
env_config += ["--tools", os.environ["tools"]]

-if os.environ.get("streaming") is not None:
-env_config += ["--streaming", os.environ["streaming"]]
+if os.environ.get("stream") is not None:
+env_config += ["--stream", os.environ["stream"]]

if os.environ.get("max_new_tokens") is not None:
env_config += ["--max_new_tokens", os.environ["max_new_tokens"]]
@@ -38,7 +38,7 @@ The agent node takes user question, hints (optional) and history (when available
## Limitations

1. Agent is only allowed to issue "SELECT" commands to databases, i.e., agent can only query databases but cannot update databases.
-2. We currently does not support "streaming" agent outputs on the fly for `sql_agent_llama`.
+2. We currently does not support "stream" agent outputs on the fly for `sql_agent_llama`.
3. Users need to pass the SQL database URI to the agent with the `db_path` environment variable. We have only validated SQLite database connected in such way.

Please submit issues if you want new features to be added. We also welcome community contributions!
@@ -35,7 +35,7 @@ def setup_hf_tgi_client(args):
"temperature": args.temperature,
"repetition_penalty": args.repetition_penalty,
"return_full_text": args.return_full_text,
-"streaming": args.streaming,
+"stream": args.stream,
}

llm = HuggingFaceEndpoint(
@@ -53,7 +53,7 @@ def setup_chat_model(args):
"temperature": args.temperature,
"max_tokens": args.max_new_tokens,
"top_p": args.top_p,
-"streaming": args.streaming,
+"stream": args.stream,
}
if args.llm_engine == "vllm" or args.llm_engine == "tgi":
openai_endpoint = f"{args.llm_endpoint_url}/v1"
@@ -115,7 +115,7 @@ def adapt_custom_prompt(local_vars, custom_prompt):
def get_args():
parser = argparse.ArgumentParser()
# llm args
-parser.add_argument("--streaming", type=str, default="true")
+parser.add_argument("--stream", type=str, default="true")
parser.add_argument("--port", type=int, default=9090)
parser.add_argument("--agent_name", type=str, default="OPEA_Default_Agent")
parser.add_argument("--strategy", type=str, default="react_langchain")
@@ -153,10 +153,10 @@ def get_args():
for key, value in vars(env_args).items():
setattr(sys_args, key, value)

-if sys_args.streaming == "true":
-sys_args.streaming = True
+if sys_args.stream == "true":
+sys_args.stream = True
else:
-sys_args.streaming = False
+sys_args.stream = False

if sys_args.use_hints == "true":
print("SQL agent will use hints")
@@ -121,7 +121,7 @@ class ServiceOrchestrator(DAG):
downstreams.remove(downstream)
except re.error as e:
logger.error("Pattern invalid! Operation cancelled.")
-if len(downstreams) == 0 and llm_parameters.streaming:
+if len(downstreams) == 0 and llm_parameters.stream:
# turn the response to a StreamingResponse
# to make the response uniform to UI
def fake_stream(text):
@@ -153,7 +153,7 @@ class ServiceOrchestrator(DAG):
if node not in nodes_to_keep:
runtime_graph.delete_node_if_exists(node)

-if not llm_parameters.streaming:
+if not llm_parameters.stream:
self.metrics.pending_update(False)

return result_dict, runtime_graph
@@ -189,7 +189,7 @@ class ServiceOrchestrator(DAG):
# pre-process
inputs = self.align_inputs(inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs)

-if is_llm_vlm and llm_parameters.streaming:
+if is_llm_vlm and llm_parameters.stream:
# Still leave to sync requests.post for StreamingResponse
if LOGFLAG:
logger.info(inputs)
@@ -203,7 +203,7 @@ class ServiceOrchestrator(DAG):
)
downstream = runtime_graph.downstream(cur_node)
if downstream:
-assert len(downstream) == 1, "Not supported multiple streaming downstreams yet!"
+assert len(downstream) == 1, "Not supported multiple stream downstreams yet!"
cur_node = downstream[0]
hitted_ends = [".", "?", "!", "。", ",", "!"]
downstream_endpoint = self.services[downstream[0]].endpoint_path
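The orchestrator hunks above gate the event-stream path on `llm_parameters.stream`. A minimal sketch of that pattern, with illustrative names rather than the orchestrator's actual internals, assuming a FastAPI response:

```python
from fastapi.responses import StreamingResponse

def wrap_result(result_text: str, stream: bool):
    if stream:
        # Mirrors the fake_stream idea referenced above: emit the whole text
        # as a single server-sent-event chunk so the UI always receives an
        # event-stream shaped response.
        def fake_stream(text):
            yield f"data: {text}\n\n"

        return StreamingResponse(fake_stream(result_text), media_type="text/event-stream")
    # Non-stream requests get the accumulated text back as a plain payload.
    return {"text": result_text}
```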
@@ -150,7 +150,7 @@ class LVMSearchedMultimodalDoc(SearchedMultimodalDoc):
top_p: float = 0.95
typical_p: float = 0.95
temperature: float = 0.01
-streaming: bool = False
+stream: bool = False
repetition_penalty: float = 1.03
chat_template: Optional[str] = Field(
default=None,
@@ -184,7 +184,7 @@ class LLMParamsDoc(BaseDoc):
frequency_penalty: float = 0.0
presence_penalty: float = 0.0
repetition_penalty: float = 1.03
-streaming: bool = True
+stream: bool = True
language: str = "auto" # can be "en", "zh"

chat_template: Optional[str] = Field(
@@ -229,7 +229,7 @@ class LLMParams(BaseDoc):
frequency_penalty: float = 0.0
presence_penalty: float = 0.0
repetition_penalty: float = 1.03
-streaming: bool = True
+stream: bool = True
language: str = "auto" # can be "en", "zh"

chat_template: Optional[str] = Field(
@@ -292,7 +292,7 @@ class LVMDoc(BaseDoc):
typical_p: float = 0.95
temperature: float = 0.01
repetition_penalty: float = 1.03
-streaming: bool = False
+stream: bool = False


class LVMVideoDoc(BaseDoc):
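The schema hunks above rename the field on the request documents while keeping the existing defaults (`True` for the LLM parameter docs, `False` for the LVM docs). A small sketch of how such a document class carries the renamed field; the class name and field subset are chosen for illustration only:

```python
from docarray import BaseDoc

class LLMParamsSketch(BaseDoc):
    # Illustrative subset of the fields shown in the hunks above.
    frequency_penalty: float = 0.0
    presence_penalty: float = 0.0
    repetition_penalty: float = 1.03
    stream: bool = True  # renamed from "streaming"
    language: str = "auto"  # can be "en", "zh"

# A request that omits "stream" now falls back to the default of True.
params = LLMParamsSketch(repetition_penalty=1.03)
print(params.stream)  # True
```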
@@ -52,7 +52,7 @@ Applications' megaservice `ServiceOrchectrator` provides following metrics:

Latency ones are histogram metrics i.e. include count, total value and set of value buckets for each item.

-They are available only for _streaming_ requests using LLM. Pending count accounts for all requests.
+They are available only for _stream_ requests using LLM. Pending count accounts for all requests.

### Inferencing Metrics
@@ -83,6 +83,6 @@ Once intent detection microservice is started, user can use below command to inv
```bash
curl http://${your_ip}:9000/v1/chat/intent\
-X POST \
--d '{"query":"What is Deep Learning?","max_new_tokens":10,"top_k":1,"temperature":0.001,"streaming":false}' \
+-d '{"query":"What is Deep Learning?","max_new_tokens":10,"top_k":1,"temperature":0.001,"stream":false}' \
-H 'Content-Type: application/json'
```
@@ -26,7 +26,7 @@ async def llm_generate(input: LLMParamsDoc):
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
-streaming=input.streaming,
+streaming=input.stream,
timeout=600,
)
@@ -60,16 +60,16 @@ curl http://${your_ip}:9000/v1/health_check\

```bash
# Streaming Response
-# Set streaming to True. Default will be True.
+# Set stream to True. Default will be True.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
-H 'Content-Type: application/json'

# Non-Streaming Response
-# Set streaming to False.
+# Set stream to False.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
--d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "streaming":false}' \
+-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "stream":false}' \
-H 'Content-Type: application/json'
```
@@ -57,7 +57,7 @@ async def llm_generate(input: LLMParamsDoc):
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
-streaming=input.streaming,
+streaming=input.stream,
server_kwargs=server_kwargs,
)
templ = """Create a concise FAQs (frequently asked questions and answers) for following text:
@@ -71,7 +71,7 @@ async def llm_generate(input: LLMParamsDoc):
# Create multiple documents
docs = [Document(page_content=t) for t in texts]

-if input.streaming:
+if input.stream:

async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
@@ -62,16 +62,16 @@ curl http://${your_ip}:9000/v1/health_check\

```bash
# Streaming Response
-# Set streaming to True. Default will be True.
+# Set stream to True. Default will be True.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}' \
-H 'Content-Type: application/json'

# Non-Streaming Response
-# Set streaming to False.
+# Set stream to False.
curl http://${your_ip}:9000/v1/faqgen \
-X POST \
--d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "streaming":false}' \
+-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "stream":false}' \
-H 'Content-Type: application/json'
```
@@ -58,7 +58,7 @@ async def llm_generate(input: LLMParamsDoc):
default_headers=headers,
max_tokens=input.max_tokens,
top_p=input.top_p,
-streaming=input.streaming,
+streaming=input.stream,
temperature=input.temperature,
)
@@ -73,7 +73,7 @@ async def llm_generate(input: LLMParamsDoc):
# Create multiple documents
docs = [Document(page_content=t) for t in texts]

-if input.streaming:
+if input.stream:

async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
@@ -76,9 +76,9 @@ curl http://${your_ip}:9000/v1/health_check\

### 3.2 Consume LLM Service

-You can set the following model parameters according to your actual needs, such as `max_tokens`, `streaming`.
+You can set the following model parameters according to your actual needs, such as `max_tokens`, `stream`.

-The `streaming` parameter determines the format of the data returned by the API. It will return text string with `streaming=false`, return text streaming flow with `streaming=true`.
+The `stream` parameter determines the format of the data returned by the API. It will return text string with `stream=false`, return text stream flow with `stream=true`.

```bash
# stream mode
@@ -133,7 +133,7 @@ class OPEALLM(OpeaComponent):
messages=prompt,
max_tokens=input.max_tokens,
top_p=input.top_p,
-stream=input.streaming,
+stream=input.stream,
frequency_penalty=input.frequency_penalty,
temperature=input.temperature,
)
@@ -105,22 +105,22 @@ If you want to deal with long context, can select suitable summary type, details
#### 3.2.1 Basic usage

```bash
-# Enable streaming to receive a streaming response. By default, this is set to True.
+# Enable stream to receive a stream response. By default, this is set to True.
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' \
-H 'Content-Type: application/json'

-# Disable streaming to receive a non-streaming response.
+# Disable stream to receive a non-stream response.
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
--d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "streaming":false}' \
+-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' \
-H 'Content-Type: application/json'

# Use Chinese mode
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
--d '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' \
+-d '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' \
-H 'Content-Type: application/json'
```
@@ -147,14 +147,14 @@ curl http://${your_ip}:9000/v1/chat/docsum \

**summary_type=map_reduce**

-Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `streaming=True` is not allowed here.
+Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `stream=True` is not allowed here.

In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`

```bash
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
--d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "streaming":false}' \
+-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \
-H 'Content-Type: application/json'
```
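To make the `chunk_size` default quoted above concrete, a quick worked example with hypothetical token budgets:

```python
# Hypothetical budgets, purely to illustrate the formula quoted above.
MAX_TOTAL_TOKENS = 4096
MAX_INPUT_TOKENS = 3072
max_tokens = 512  # the summary length requested via input.max_tokens

# Leave room for the requested summary plus a 50-token margin,
# but never exceed what the model accepts as input.
chunk_size = min(MAX_TOTAL_TOKENS - max_tokens - 50, MAX_INPUT_TOKENS)
print(chunk_size)  # min(3534, 3072) -> 3072
```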
@@ -171,9 +171,9 @@ async def llm_generate(input: DocSumLLMParams):
server_kwargs["headers"] = {"Authorization": f"Bearer {access_token}"}

## LLM
-if input.streaming and input.summary_type == "map_reduce":
-logger.info("Map Reduce mode don't support streaming=True, set to streaming=False")
-input.streaming = False
+if input.stream and input.summary_type == "map_reduce":
+logger.info("Map Reduce mode don't support stream=True, set to stream=False")
+input.stream = False
llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
llm = HuggingFaceEndpoint(
endpoint_url=llm_endpoint,
@@ -183,7 +183,7 @@ async def llm_generate(input: DocSumLLMParams):
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
-streaming=input.streaming,
+streaming=input.stream,
server_kwargs=server_kwargs,
)
@@ -209,7 +209,7 @@ async def llm_generate(input: DocSumLLMParams):
else:
raise NotImplementedError('Please specify the summary_type in "stuff", "truncate", "map_reduce", "refine"')

-if input.streaming:
+if input.stream:

async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
@@ -104,22 +104,22 @@ If you want to deal with long context, can select suitable summary type, details
#### 3.2.1 Basic usage

```bash
-# Enable streaming to receive a streaming response. By default, this is set to True.
+# Enable stream to receive a stream response. By default, this is set to True.
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en"}' \
-H 'Content-Type: application/json'

-# Disable streaming to receive a non-streaming response.
+# Disable stream to receive a non-stream response.
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
--d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "streaming":false}' \
+-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "stream":false}' \
-H 'Content-Type: application/json'

# Use Chinese mode
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
--d '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "streaming":false}' \
+-d '{"query":"2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。", "max_tokens":32, "language":"zh", "stream":false}' \
-H 'Content-Type: application/json'
```
@@ -146,14 +146,14 @@ curl http://${your_ip}:9000/v1/chat/docsum \

**summary_type=map_reduce**

-Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `streaming=True` is not allowed here.
+Map_reduce mode will split the inputs into multiple chunks, map each document to an individual summary, then consolidate those summaries into a single global summary. `stream=True` is not allowed here.

In this mode, default `chunk_size` is set to be `min(MAX_TOTAL_TOKENS - input.max_tokens - 50, MAX_INPUT_TOKENS)`

```bash
curl http://${your_ip}:9000/v1/chat/docsum \
-X POST \
--d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "streaming":false}' \
+-d '{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5.", "max_tokens":32, "language":"en", "summary_type": "map_reduce", "chunk_size": 2000, "stream":false}' \
-H 'Content-Type: application/json'
```
@@ -172,9 +172,9 @@ async def llm_generate(input: DocSumLLMParams):
headers = {"Authorization": f"Bearer {access_token}"}

## LLM
-if input.streaming and input.summary_type == "map_reduce":
-logger.info("Map Reduce mode don't support streaming=True, set to streaming=False")
-input.streaming = False
+if input.stream and input.summary_type == "map_reduce":
+logger.info("Map Reduce mode don't support stream=True, set to stream=False")
+input.stream = False
llm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8080")
model = input.model if input.model else os.getenv("LLM_MODEL_ID")
llm = VLLMOpenAI(
@@ -184,7 +184,7 @@ async def llm_generate(input: DocSumLLMParams):
default_headers=headers,
max_tokens=input.max_tokens,
top_p=input.top_p,
-streaming=input.streaming,
+streaming=input.stream,
temperature=input.temperature,
presence_penalty=input.repetition_penalty,
)
@@ -211,7 +211,7 @@ async def llm_generate(input: DocSumLLMParams):
else:
raise NotImplementedError('Please specify the summary_type in "stuff", "truncate", "map_reduce", "refine"')

-if input.streaming:
+if input.stream:

async def stream_generator():
from langserve.serialization import WellKnownLCSerializer
@@ -269,12 +269,12 @@ curl http://${host_ip}:8008/v1/chat/completions \

### 3.3 Consume LLM Service

-You can set the following model parameters according to your actual needs, such as `max_tokens`, `streaming`.
+You can set the following model parameters according to your actual needs, such as `max_tokens`, `stream`.

-The `streaming` parameter determines the format of the data returned by the API. It will return text string with `streaming=false`, return text streaming flow with `streaming=true`.
+The `stream` parameter determines the format of the data returned by the API. It will return text string with `stream=false`, return text stream flow with `stream=true`.

```bash
-# non-streaming mode
+# non-stream mode
curl http://${your_ip}:9000/v1/chat/completions \
-X POST \
-H 'Content-Type: application/json' \
@@ -286,11 +286,11 @@ curl http://${your_ip}:9000/v1/chat/completions \
"typical_p":0.95,
"temperature":0.01,
"repetition_penalty":1.03,
-"streaming":false
+"stream":false
}'


-# streaming mode
+# stream mode
curl http://${your_ip}:9000/v1/chat/completions \
-X POST \
-H 'Content-Type: application/json' \
@@ -302,7 +302,7 @@ curl http://${your_ip}:9000/v1/chat/completions \
"typical_p":0.95,
"temperature":0.01,
"repetition_penalty":1.03,
-"streaming":true
+"stream":true
}'

```
@@ -1,6 +1,6 @@
# LLM Native Microservice

-LLM Native microservice uses [optimum-habana](https://github.com/huggingface/optimum-habana) for model initialization and warm-up, focusing solely on large language models (LLMs). It operates without frameworks like TGI/VLLM, using PyTorch directly for inference, and supports only non-streaming formats. This streamlined approach optimizes performance on Habana hardware.
+LLM Native microservice uses [optimum-habana](https://github.com/huggingface/optimum-habana) for model initialization and warm-up, focusing solely on large language models (LLMs). It operates without frameworks like TGI/VLLM, using PyTorch directly for inference, and supports only non-stream formats. This streamlined approach optimizes performance on Habana hardware.

## 🚀1. Start Microservice
@@ -70,5 +70,5 @@ docker run --network host -e http_proxy=$http_proxy -e https_proxy=$https_proxy
## Consume the Ollama Microservice

```bash
-curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"model": "llama3", "query":"What is Deep Learning?","max_tokens":32,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' -H 'Content-Type: application/json'
+curl http://127.0.0.1:9000/v1/chat/completions -X POST -d '{"model": "llama3", "query":"What is Deep Learning?","max_tokens":32,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"stream":true}' -H 'Content-Type: application/json'
```
@@ -32,7 +32,7 @@ async def llm_generate(input: LLMParamsDoc):
repeat_penalty=input.repetition_penalty,
)
# assuming you have Ollama installed and have llama3 model pulled with `ollama pull llama3`
-if input.streaming:
+if input.stream:

async def stream_generator():
chat_response = ""
@@ -21,7 +21,7 @@ docker run -d -p 9000:9000 -e PREDICTIONGUARD_API_KEY=$PREDICTIONGUARD_API_KEY

See the [Prediction Guard docs](https://docs.predictionguard.com/) for available model options.

-### Without streaming
+### Without stream

```bash
curl -X POST http://localhost:9000/v1/chat/completions \
@@ -37,7 +37,7 @@ curl -X POST http://localhost:9000/v1/chat/completions \
}'
```

-### With streaming
+### With stream

```bash
curl -N -X POST http://localhost:9000/v1/chat/completions \
@@ -41,7 +41,7 @@ def llm_generate(input: LLMParamsDoc):
{"role": "user", "content": input.query},
]

-if input.streaming:
+if input.stream:

async def stream_generator():
chat_response = ""
@@ -94,7 +94,7 @@ bash ./launch_vllm_service.sh 8008 meta-llama/Meta-Llama-3-70b hpu 8

### 2.3 vLLM with OpenVINO (on Intel GPU and CPU)

-vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](https://github.com/vllm-project/vllm/blob/main/docs/source/models/supported_models.rst) and can perform optimal model serving on Intel GPU and all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs (starting from Intel® UHD Graphics generation). OpenVINO vLLM backend supports the following advanced vLLM features:
+vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](https://github.com/vllm-project/vllm/blob/main/docs/source/models/supported_models.md) and can perform optimal model serving on Intel GPU and all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs (starting from Intel® UHD Graphics generation). OpenVINO vLLM backend supports the following advanced vLLM features:

- Prefix caching (`--enable-prefix-caching`)
- Chunked prefill (`--enable-chunked-prefill`)
@@ -220,7 +220,7 @@ curl http://${your_ip}:9000/v1/health_check\
User can set the following model parameters according to needs:

- max_tokens: Total output token
-- streaming(true/false): return text response in streaming mode or non-streaming mode
+- stream(true/false): return text response in stream mode or non-stream mode

```bash
# stream mode
@@ -120,7 +120,7 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, Searche
if logflag:
logger.info(f"[ SearchedDoc ] final input: {new_input}")

-if new_input.streaming:
+if new_input.stream:

async def stream_generator():
chat_response = ""
@@ -172,7 +172,7 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, Searche
# use rag default template
prompt = ChatTemplate.generate_rag_prompt(input.query, input.documents, model_name)

-if input.streaming:
+if input.stream:

async def stream_generator():
chat_response = ""
@@ -70,7 +70,7 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc
f"[ LVMSearchedMultimodalDoc ] {prompt_template} not used, we only support 2 input variables ['question', 'context']"
)
max_new_tokens = request.max_new_tokens
-streaming = request.streaming
+stream = request.stream
repetition_penalty = request.repetition_penalty
temperature = request.temperature
top_k = request.top_k
@@ -82,7 +82,7 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc
img_b64_str = request.image
prompt = request.prompt
max_new_tokens = request.max_new_tokens
-streaming = request.streaming
+stream = request.stream
repetition_penalty = request.repetition_penalty
temperature = request.temperature
top_k = request.top_k
@@ -98,13 +98,13 @@ async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc
image = f"data:image/png;base64,{img_b64_str}"
image_prompt = f"\n{prompt}\nASSISTANT:"

-if streaming:
+if stream:

async def stream_generator():
chat_response = ""
text_generation = await lvm_client.text_generation(
prompt=image_prompt,
-stream=streaming,
+stream=stream,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
temperature=temperature,
@@ -48,7 +48,7 @@ def llm_generate(input: RAGASParams):
typical_p=input.typical_p,
temperature=input.temperature,
repetition_penalty=input.repetition_penalty,
-streaming=input.streaming,
+streaming=input.stream,
timeout=600,
)