refine logging code. (#559)
* add ut and refine logging code.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

* update microservice port.

---------

Co-authored-by: root <root@idc708073.jf.intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
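In brief, the change replaces bare `print()` calls with a named `CustomLogger` and moves the finetuning microservice from port 8005 to 8015. A minimal sketch of the new logging pattern, taken from the diff below; the stdlib fallback is an assumption for running outside the `comps` package, and the job id value is hypothetical:

```python
# Sketch of the pattern this commit applies: each module creates a named
# CustomLogger and routes messages through it instead of print().
try:
    from comps import CustomLogger  # import used throughout the diff below
    logger = CustomLogger("finetuning_handlers")
except ImportError:
    # Assumed fallback so the sketch runs without `comps` installed.
    import logging

    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    logger = logging.getLogger("finetuning_handlers")

ray_job_id = "raysubmit_0000"  # hypothetical value, for illustration only

# before: print(f"Submitted Ray job: {ray_job_id} ...")
logger.info(f"Submitted Ray job: {ray_job_id} ...")
```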
@@ -61,7 +61,7 @@ docker build -t opea/finetuning:latest --build-arg https_proxy=$https_proxy --bu
 Start docker container with below command:
 
 ```bash
-docker run -d --name="finetuning-server" -p 8005:8005 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/finetuning:latest
+docker run -d --name="finetuning-server" -p 8015:8015 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/finetuning:latest
 ```
 
 ## 2.2 Setup on Gaudi2
@@ -81,7 +81,7 @@ Start docker container with below command:
 
 ```bash
 export HF_TOKEN=${your_huggingface_token}
-docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all -p 8005:8005 -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e no_proxy=$no_proxy -e HF_TOKEN=$HF_TOKEN opea/finetuning-gaudi:latest
+docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all -p 8015:8015 -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e no_proxy=$no_proxy -e HF_TOKEN=$HF_TOKEN opea/finetuning-gaudi:latest
 ```
 
 # 🚀3. Consume Finetuning Service
@@ -92,10 +92,10 @@ Assuming a training file `alpaca_data.json` is uploaded, it can be downloaded in
 
 ```bash
 # upload a training file
-curl http://${your_ip}:8005/v1/finetune/upload_training_files -X POST -H "Content-Type: multipart/form-data" -F "files=@./alpaca_data.json"
+curl http://${your_ip}:8015/v1/finetune/upload_training_files -X POST -H "Content-Type: multipart/form-data" -F "files=@./alpaca_data.json"
 
 # create a finetuning job
-curl http://${your_ip}:8005/v1/fine_tuning/jobs \
+curl http://${your_ip}:8015/v1/fine_tuning/jobs \
   -X POST \
   -H "Content-Type: application/json" \
   -d '{
@@ -104,18 +104,18 @@ curl http://${your_ip}:8005/v1/fine_tuning/jobs \
 }'
 
 # list finetuning jobs
-curl http://${your_ip}:8005/v1/fine_tuning/jobs -X GET
+curl http://${your_ip}:8015/v1/fine_tuning/jobs -X GET
 
 # retrieve one finetuning job
-curl http://localhost:8005/v1/fine_tuning/jobs/retrieve -X POST -H "Content-Type: application/json" -d '{
+curl http://localhost:8015/v1/fine_tuning/jobs/retrieve -X POST -H "Content-Type: application/json" -d '{
   "fine_tuning_job_id": ${fine_tuning_job_id}}'
 
 # cancel one finetuning job
 
-curl http://localhost:8005/v1/fine_tuning/jobs/cancel -X POST -H "Content-Type: application/json" -d '{
+curl http://localhost:8015/v1/fine_tuning/jobs/cancel -X POST -H "Content-Type: application/json" -d '{
   "fine_tuning_job_id": ${fine_tuning_job_id}}'
 
 # list checkpoints of a finetuning job
-curl http://${your_ip}:8005/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}'
+curl http://${your_ip}:8015/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}'
 
 ```
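The same flow can be scripted end to end. Below is a minimal Python sketch mirroring the curl commands above, assuming the service listens on the new port 8015 and that the job payload uses OpenAI-style `training_file` and `model` fields with an `id` field in the response — the payload body is elided by the hunk, so those field names are an assumption:

```python
import requests  # assumes the `requests` package is installed

base = "http://localhost:8015"  # new port introduced by this change

# upload a training file
with open("./alpaca_data.json", "rb") as f:
    upload = requests.post(f"{base}/v1/finetune/upload_training_files", files={"files": f})
print(upload.text)

# create a finetuning job; field names assumed from the OpenAI-style API,
# model name taken from MODEL_CONFIG_FILE_MAP in the diff below
job = requests.post(
    f"{base}/v1/fine_tuning/jobs",
    json={"training_file": "alpaca_data.json", "model": "meta-llama/Llama-2-7b-chat-hf"},
).json()

# list jobs, then retrieve and cancel the one just created
print(requests.get(f"{base}/v1/fine_tuning/jobs").json())
print(requests.post(f"{base}/v1/fine_tuning/jobs/retrieve", json={"fine_tuning_job_id": job["id"]}).json())
print(requests.post(f"{base}/v1/fine_tuning/jobs/cancel", json={"fine_tuning_job_id": job["id"]}).json())
```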
@@ -20,20 +20,20 @@ from comps.finetuning.handlers import (
 )
 
 
-@register_microservice(name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8005)
+@register_microservice(name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015)
 def create_finetuning_jobs(request: FineTuningJobsRequest, background_tasks: BackgroundTasks):
     return handle_create_finetuning_jobs(request, background_tasks)
 
 
 @register_microservice(
-    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8005, methods=["GET"]
+    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015, methods=["GET"]
 )
 def list_finetuning_jobs():
     return handle_list_finetuning_jobs()
 
 
 @register_microservice(
-    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/retrieve", host="0.0.0.0", port=8005
+    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/retrieve", host="0.0.0.0", port=8015
 )
 def retrieve_finetuning_job(request: FineTuningJobIDRequest):
     job = handle_retrieve_finetuning_job(request)
@@ -41,7 +41,7 @@ def retrieve_finetuning_job(request: FineTuningJobIDRequest):
 
 
 @register_microservice(
-    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/cancel", host="0.0.0.0", port=8005
+    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/cancel", host="0.0.0.0", port=8015
 )
 def cancel_finetuning_job(request: FineTuningJobIDRequest):
     job = handle_cancel_finetuning_job(request)
@@ -52,7 +52,7 @@ def cancel_finetuning_job(request: FineTuningJobIDRequest):
     name="opea_service@finetuning",
     endpoint="/v1/finetune/upload_training_files",
     host="0.0.0.0",
-    port=8005,
+    port=8015,
 )
 async def upload_training_files(
     files: Optional[Union[UploadFile, List[UploadFile]]] = File(None),
@@ -69,7 +69,7 @@ async def upload_training_files(
 
 
 @register_microservice(
-    name="opea_service@finetuning", endpoint="/v1/finetune/list_checkpoints", host="0.0.0.0", port=8005
+    name="opea_service@finetuning", endpoint="/v1/finetune/list_checkpoints", host="0.0.0.0", port=8015
 )
 def list_checkpoints(request: FineTuningJobIDRequest):
     checkpoints = handle_list_finetuning_checkpoints(request)
@@ -12,6 +12,7 @@ from fastapi import BackgroundTasks, HTTPException
 from pydantic_yaml import parse_yaml_raw_as, to_yaml_file
 from ray.job_submission import JobSubmissionClient
 
+from comps import CustomLogger
 from comps.cores.proto.api_protocol import (
     FineTuningJob,
     FineTuningJobIDRequest,
@@ -20,6 +21,8 @@ from comps.cores.proto.api_protocol import (
 )
 from comps.finetuning.llm_on_ray.finetune.finetune_config import FinetuneConfig
 
+logger = CustomLogger("finetuning_handlers")
+
 MODEL_CONFIG_FILE_MAP = {
     "meta-llama/Llama-2-7b-chat-hf": "./models/llama-2-7b-chat-hf.yaml",
     "mistralai/Mistral-7B-v0.1": "./models/mistral-7b-v0.1.yaml",
@@ -50,7 +53,7 @@ def update_job_status(job_id: FineTuningJobID):
         status = str(job_status).lower()
         # Ray status "stopped" is OpenAI status "cancelled"
         status = "cancelled" if status == "stopped" else status
-        print(f"Status of job {job_id} is '{status}'")
+        logger.info(f"Status of job {job_id} is '{status}'")
         running_finetuning_jobs[job_id].status = status
         if status == "finished" or status == "cancelled" or status == "failed":
             break
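The loop above polls Ray for the job status and normalizes it to the OpenAI vocabulary before storing it. Extracted as a standalone sketch of that mapping — the function and set names here are illustrative, not from the source:

```python
# Ray reports statuses like "RUNNING" or "STOPPED"; the OpenAI-style API
# exposes lowercase names and calls a stopped job "cancelled".
TERMINAL_STATUSES = {"finished", "cancelled", "failed"}

def normalize_ray_status(ray_status: str) -> str:
    status = str(ray_status).lower()
    # Ray status "stopped" is OpenAI status "cancelled"
    return "cancelled" if status == "stopped" else status

assert normalize_ray_status("STOPPED") == "cancelled"
assert normalize_ray_status("FINISHED") in TERMINAL_STATUSES  # the loop exits on these
```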
@@ -102,7 +105,7 @@ def handle_create_finetuning_jobs(request: FineTuningJobsRequest, background_tas
     )
     finetune_config.General.output_dir = os.path.join(JOBS_PATH, job.id)
     if os.getenv("DEVICE", ""):
-        print(f"specific device: {os.getenv('DEVICE')}")
+        logger.info(f"specific device: {os.getenv('DEVICE')}")
         finetune_config.Training.device = os.getenv("DEVICE")
 
     finetune_config_file = f"{JOBS_PATH}/{job.id}.yaml"
@@ -117,7 +120,7 @@ def handle_create_finetuning_jobs(request: FineTuningJobsRequest, background_tas
         # Path to the local directory that contains the script.py file
         runtime_env={"working_dir": "./"},
     )
-    print(f"Submitted Ray job: {ray_job_id} ...")
+    logger.info(f"Submitted Ray job: {ray_job_id} ...")
 
     running_finetuning_jobs[job.id] = job
     finetuning_job_to_ray_job[job.id] = ray_job_id
@@ -169,7 +172,7 @@ async def save_content_to_local_disk(save_path: str, content):
             content = await content.read()
             fout.write(content)
     except Exception as e:
-        print(f"Write file failed. Exception: {e}")
+        logger.info(f"Write file failed. Exception: {e}")
         raise Exception(status_code=500, detail=f"Write file {save_path} failed. Exception: {e}")
 
 
@@ -2,11 +2,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
 if [[ -n "$RAY_PORT" ]];then
-    export RAY_ADDRESS=http://127.0.0.1:$RAY_PORT
     ray start --head --port $RAY_PORT
 else
-    export RAY_ADDRESS=http://127.0.0.1:8265
     ray start --head
+    export RAY_PORT=8265
 fi
 
+export RAY_ADDRESS=http://127.0.0.1:$RAY_PORT
 python finetuning_service.py
@@ -3,5 +3,4 @@
 #
 # Copyright 2023 The LLM-on-Ray Authors.
 
-from .logging import logger
 from .torch_config import TorchConfig
@@ -7,7 +7,9 @@ import glob
 import importlib
 import os
 
-from .logging import logger
+from comps import CustomLogger
+
+logger = CustomLogger("llm_on_ray")
 
 
 def import_all_modules(basedir, prefix=None):
@@ -1,56 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-#
-# Copyright 2023 The LLM-on-Ray Authors.
-
-import functools
-import logging
-import logging.config
-import traceback
-
-__all__ = ["logger", "get_logger"]
-
-use_accelerate_log = False
-logger_name = "common"
-
-logging_config = {
-    "version": 1,
-    "loggers": {
-        "root": {"level": "INFO", "handlers": ["consoleHandler"]},
-        "common": {
-            "level": "INFO",
-            "handlers": ["consoleHandler"],
-            "qualname": "common",
-            "propagate": 0,
-        },
-    },
-    "handlers": {
-        "consoleHandler": {
-            "class": "logging.StreamHandler",
-            "level": "INFO",
-            "formatter": "standardFormatter",
-        },
-    },
-    "formatters": {
-        "standardFormatter": {
-            "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-            "datefmt": "",
-        }
-    },
-}
-
-if logging_config is not None:
-    try:
-        logging.config.dictConfig(logging_config)
-    except Exception:
-        traceback.print_exc()
-        exit(1)
-
-if use_accelerate_log:
-    import accelerate
-
-    get_logger = functools.partial(accelerate.logging.get_logger, name=logger_name)
-else:
-    get_logger = functools.partial(logging.getLogger, name=logger_name)
-
-logger = get_logger()
@@ -23,10 +23,13 @@ from ray.air import FailureConfig, RunConfig
 from ray.air.config import ScalingConfig
 from ray.train.torch import TorchTrainer
 
+from comps import CustomLogger
 from comps.finetuning.llm_on_ray import common
 from comps.finetuning.llm_on_ray.finetune.data_process import DataProcessor
 from comps.finetuning.llm_on_ray.finetune.finetune_config import FinetuneConfig
+
+logger = CustomLogger("llm_on_ray/finetune")
 
 
 def adapt_transformers_to_device(config: Dict):
     device = config["Training"]["device"]
@@ -332,10 +335,10 @@ def train_func(config: Dict[str, Any]):
 
     training_args, trainer = get_trainer(config, model, tokenizer, tokenized_dataset, data_collator)
 
-    common.logger.info("train start")
+    logger.info("train start")
     trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
     trainer.save_model()
-    common.logger.info("train finish")
+    logger.info("train finish")
 
 
 def get_finetune_config():
@@ -401,7 +404,7 @@ def main(external_config=None):
     else:
         ray.init(runtime_env=runtime_env)
 
-    common.logger.info(f"ray available resources = {ray.available_resources()}")
+    logger.info(f"ray available resources = {ray.available_resources()}")
     use_gpu = True if device == "gpu" else False
     scaling_config = ScalingConfig(
         num_workers=num_training_workers,