[ChatQnA] Update the default LLM to llama3-8B on cpu/gpu/hpu (#1430)

Update the default LLM to llama3-8B on cpu/nvgpu/amdgpu/gaudi for docker-compose deployment, to avoid the potential model-serving issue and the missing chat-template issue encountered with neural-chat-7b.

Related issue: slow serving of neural-chat-7b on ICX (#1420).
Signed-off-by: Wang, Kai Lawrence <kai.lawrence.wang@intel.com>
Author: Wang, Kai Lawrence
Date: 2025-01-20 22:47:56 +08:00
Committed by: GitHub
Parent: f11ab458d8
Commit: 3d3ac59bfb
25 changed files with 96 additions and 80 deletions


@@ -57,7 +57,7 @@ RERANK_SERVER_HOST_IP = os.getenv("RERANK_SERVER_HOST_IP", "0.0.0.0")
 RERANK_SERVER_PORT = int(os.getenv("RERANK_SERVER_PORT", 80))
 LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0")
 LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 80))
-LLM_MODEL = os.getenv("LLM_MODEL", "Intel/neural-chat-7b-v3-3")
+LLM_MODEL = os.getenv("LLM_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct")
 def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
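Because the model ID is read from the environment, the new default only applies when LLM_MODEL is unset. A minimal sketch of the fallback behavior, using only the standard library (the variable name LLM_MODEL matches the diff above; the override value shown is illustrative, not part of this commit):

```python
import os

# The service reads the model ID from the environment; when LLM_MODEL is
# unset, it falls back to the Llama 3 default introduced by this commit.
LLM_MODEL = os.getenv("LLM_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct")

if __name__ == "__main__":
    # Exporting LLM_MODEL before starting the stack (e.g. in the
    # docker-compose environment) restores the previous default:
    #   export LLM_MODEL="Intel/neural-chat-7b-v3-3"
    print(f"Serving with model: {LLM_MODEL}")
```

Note that meta-llama/Meta-Llama-3-8B-Instruct is a gated model on Hugging Face, so deployments need an access token with granted access (typically supplied via HUGGINGFACEHUB_API_TOKEN in these compose files).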