refine logging code. (#559)
* add ut and refine logging code.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

* update microservice port.

---------

Co-authored-by: root <root@idc708073.jf.intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
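In brief, the change replaces bare `print()` calls with a named `CustomLogger` and moves the finetuning microservice from port 8005 to 8015. A minimal sketch of the new logging pattern, taken from the diff below; the stdlib fallback is an assumption for running outside the `comps` package, and the job id value is hypothetical:

```python
# Sketch of the pattern this commit applies: each module creates a named
# CustomLogger and routes messages through it instead of print().
try:
    from comps import CustomLogger  # import used throughout the diff below
    logger = CustomLogger("finetuning_handlers")
except ImportError:
    # Assumed fallback so the sketch runs without `comps` installed.
    import logging

    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    logger = logging.getLogger("finetuning_handlers")

ray_job_id = "raysubmit_0000"  # hypothetical value, for illustration only

# before: print(f"Submitted Ray job: {ray_job_id} ...")
logger.info(f"Submitted Ray job: {ray_job_id} ...")
```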
@@ -61,7 +61,7 @@ docker build -t opea/finetuning:latest --build-arg https_proxy=$https_proxy --bu
 Start docker container with below command:
 
 ```bash
-docker run -d --name="finetuning-server" -p 8005:8005 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/finetuning:latest
+docker run -d --name="finetuning-server" -p 8015:8015 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/finetuning:latest
 ```
 
 ## 2.2 Setup on Gaudi2
@@ -81,7 +81,7 @@ Start docker container with below command:
 
 ```bash
 export HF_TOKEN=${your_huggingface_token}
-docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all -p 8005:8005 -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e no_proxy=$no_proxy -e HF_TOKEN=$HF_TOKEN opea/finetuning-gaudi:latest
+docker run --runtime=habana -e HABANA_VISIBLE_DEVICES=all -p 8015:8015 -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e no_proxy=$no_proxy -e HF_TOKEN=$HF_TOKEN opea/finetuning-gaudi:latest
 ```
 
 # 🚀3. Consume Finetuning Service
@@ -92,10 +92,10 @@ Assuming a training file `alpaca_data.json` is uploaded, it can be downloaded in
 
 ```bash
 # upload a training file
-curl http://${your_ip}:8005/v1/finetune/upload_training_files -X POST -H "Content-Type: multipart/form-data" -F "files=@./alpaca_data.json"
+curl http://${your_ip}:8015/v1/finetune/upload_training_files -X POST -H "Content-Type: multipart/form-data" -F "files=@./alpaca_data.json"
 
 # create a finetuning job
-curl http://${your_ip}:8005/v1/fine_tuning/jobs \
+curl http://${your_ip}:8015/v1/fine_tuning/jobs \
   -X POST \
   -H "Content-Type: application/json" \
   -d '{
@@ -104,18 +104,18 @@ curl http://${your_ip}:8005/v1/fine_tuning/jobs \
 }'
 
 # list finetuning jobs
-curl http://${your_ip}:8005/v1/fine_tuning/jobs -X GET
+curl http://${your_ip}:8015/v1/fine_tuning/jobs -X GET
 
 # retrieve one finetuning job
-curl http://localhost:8005/v1/fine_tuning/jobs/retrieve -X POST -H "Content-Type: application/json" -d '{
+curl http://localhost:8015/v1/fine_tuning/jobs/retrieve -X POST -H "Content-Type: application/json" -d '{
   "fine_tuning_job_id": ${fine_tuning_job_id}}'
 
 # cancel one finetuning job
 
-curl http://localhost:8005/v1/fine_tuning/jobs/cancel -X POST -H "Content-Type: application/json" -d '{
+curl http://localhost:8015/v1/fine_tuning/jobs/cancel -X POST -H "Content-Type: application/json" -d '{
   "fine_tuning_job_id": ${fine_tuning_job_id}}'
 
 # list checkpoints of a finetuning job
-curl http://${your_ip}:8005/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}'
+curl http://${your_ip}:8015/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}'
 
 ```
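The same flow can be scripted end to end. Below is a minimal Python sketch mirroring the curl commands above, assuming the service listens on the new port 8015 and that the job payload uses OpenAI-style `training_file` and `model` fields with an `id` field in the response — the payload body is elided by the hunk, so those field names are an assumption:

```python
import requests  # assumes the `requests` package is installed

base = "http://localhost:8015"  # new port introduced by this change

# upload a training file
with open("./alpaca_data.json", "rb") as f:
    upload = requests.post(f"{base}/v1/finetune/upload_training_files", files={"files": f})
print(upload.text)

# create a finetuning job; field names assumed from the OpenAI-style API,
# model name taken from MODEL_CONFIG_FILE_MAP in the diff below
job = requests.post(
    f"{base}/v1/fine_tuning/jobs",
    json={"training_file": "alpaca_data.json", "model": "meta-llama/Llama-2-7b-chat-hf"},
).json()

# list jobs, then retrieve and cancel the one just created
print(requests.get(f"{base}/v1/fine_tuning/jobs").json())
print(requests.post(f"{base}/v1/fine_tuning/jobs/retrieve", json={"fine_tuning_job_id": job["id"]}).json())
print(requests.post(f"{base}/v1/fine_tuning/jobs/cancel", json={"fine_tuning_job_id": job["id"]}).json())
```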
@@ -20,20 +20,20 @@ from comps.finetuning.handlers import (
 )
 
 
-@register_microservice(name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8005)
+@register_microservice(name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015)
 def create_finetuning_jobs(request: FineTuningJobsRequest, background_tasks: BackgroundTasks):
     return handle_create_finetuning_jobs(request, background_tasks)
 
 
 @register_microservice(
-    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8005, methods=["GET"]
+    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015, methods=["GET"]
 )
 def list_finetuning_jobs():
     return handle_list_finetuning_jobs()
 
 
 @register_microservice(
-    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/retrieve", host="0.0.0.0", port=8005
+    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/retrieve", host="0.0.0.0", port=8015
 )
 def retrieve_finetuning_job(request: FineTuningJobIDRequest):
     job = handle_retrieve_finetuning_job(request)
@@ -41,7 +41,7 @@ def retrieve_finetuning_job(request: FineTuningJobIDRequest):
 
 
 @register_microservice(
-    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/cancel", host="0.0.0.0", port=8005
+    name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/cancel", host="0.0.0.0", port=8015
 )
 def cancel_finetuning_job(request: FineTuningJobIDRequest):
     job = handle_cancel_finetuning_job(request)
@@ -52,7 +52,7 @@ def cancel_finetuning_job(request: FineTuningJobIDRequest):
     name="opea_service@finetuning",
     endpoint="/v1/finetune/upload_training_files",
     host="0.0.0.0",
-    port=8005,
+    port=8015,
 )
 async def upload_training_files(
     files: Optional[Union[UploadFile, List[UploadFile]]] = File(None),
@@ -69,7 +69,7 @@ async def upload_training_files(
 
 
 @register_microservice(
-    name="opea_service@finetuning", endpoint="/v1/finetune/list_checkpoints", host="0.0.0.0", port=8005
+    name="opea_service@finetuning", endpoint="/v1/finetune/list_checkpoints", host="0.0.0.0", port=8015
 )
 def list_checkpoints(request: FineTuningJobIDRequest):
     checkpoints = handle_list_finetuning_checkpoints(request)
@@ -12,6 +12,7 @@ from fastapi import BackgroundTasks, HTTPException
 from pydantic_yaml import parse_yaml_raw_as, to_yaml_file
 from ray.job_submission import JobSubmissionClient
 
+from comps import CustomLogger
 from comps.cores.proto.api_protocol import (
     FineTuningJob,
     FineTuningJobIDRequest,
@@ -20,6 +21,8 @@ from comps.cores.proto.api_protocol import (
 )
 from comps.finetuning.llm_on_ray.finetune.finetune_config import FinetuneConfig
 
+logger = CustomLogger("finetuning_handlers")
+
 MODEL_CONFIG_FILE_MAP = {
     "meta-llama/Llama-2-7b-chat-hf": "./models/llama-2-7b-chat-hf.yaml",
     "mistralai/Mistral-7B-v0.1": "./models/mistral-7b-v0.1.yaml",
@@ -50,7 +53,7 @@ def update_job_status(job_id: FineTuningJobID):
         status = str(job_status).lower()
         # Ray status "stopped" is OpenAI status "cancelled"
         status = "cancelled" if status == "stopped" else status
-        print(f"Status of job {job_id} is '{status}'")
+        logger.info(f"Status of job {job_id} is '{status}'")
         running_finetuning_jobs[job_id].status = status
         if status == "finished" or status == "cancelled" or status == "failed":
             break
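The loop above polls Ray for the job status and normalizes it to the OpenAI vocabulary before storing it. Extracted as a standalone sketch of that mapping — the function and set names here are illustrative, not from the source:

```python
# Ray reports statuses like "RUNNING" or "STOPPED"; the OpenAI-style API
# exposes lowercase names and calls a stopped job "cancelled".
TERMINAL_STATUSES = {"finished", "cancelled", "failed"}

def normalize_ray_status(ray_status: str) -> str:
    status = str(ray_status).lower()
    # Ray status "stopped" is OpenAI status "cancelled"
    return "cancelled" if status == "stopped" else status

assert normalize_ray_status("STOPPED") == "cancelled"
assert normalize_ray_status("FINISHED") in TERMINAL_STATUSES  # the loop exits on these
```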
@@ -102,7 +105,7 @@ def handle_create_finetuning_jobs(request: FineTuningJobsRequest, background_tas
     )
     finetune_config.General.output_dir = os.path.join(JOBS_PATH, job.id)
     if os.getenv("DEVICE", ""):
-        print(f"specific device: {os.getenv('DEVICE')}")
+        logger.info(f"specific device: {os.getenv('DEVICE')}")
         finetune_config.Training.device = os.getenv("DEVICE")
 
     finetune_config_file = f"{JOBS_PATH}/{job.id}.yaml"
@@ -117,7 +120,7 @@ def handle_create_finetuning_jobs(request: FineTuningJobsRequest, background_tas
         # Path to the local directory that contains the script.py file
         runtime_env={"working_dir": "./"},
     )
-    print(f"Submitted Ray job: {ray_job_id} ...")
+    logger.info(f"Submitted Ray job: {ray_job_id} ...")
 
     running_finetuning_jobs[job.id] = job
     finetuning_job_to_ray_job[job.id] = ray_job_id
@@ -169,7 +172,7 @@ async def save_content_to_local_disk(save_path: str, content):
             content = await content.read()
             fout.write(content)
     except Exception as e:
-        print(f"Write file failed. Exception: {e}")
+        logger.info(f"Write file failed. Exception: {e}")
         raise Exception(status_code=500, detail=f"Write file {save_path} failed. Exception: {e}")
 
 
@@ -2,11 +2,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
 if [[ -n "$RAY_PORT" ]];then
-    export RAY_ADDRESS=http://127.0.0.1:$RAY_PORT
     ray start --head --port $RAY_PORT
 else
-    export RAY_ADDRESS=http://127.0.0.1:8265
     ray start --head
+    export RAY_PORT=8265
 fi
 
+export RAY_ADDRESS=http://127.0.0.1:$RAY_PORT
 python finetuning_service.py
@@ -3,5 +3,4 @@
 #
 # Copyright 2023 The LLM-on-Ray Authors.
 
-from .logging import logger
 from .torch_config import TorchConfig
@@ -7,7 +7,9 @@ import glob
 import importlib
 import os
 
-from .logging import logger
+from comps import CustomLogger
+
+logger = CustomLogger("llm_on_ray")
 
 
 def import_all_modules(basedir, prefix=None):
@@ -1,56 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-#
-# Copyright 2023 The LLM-on-Ray Authors.
-
-import functools
-import logging
-import logging.config
-import traceback
-
-__all__ = ["logger", "get_logger"]
-
-use_accelerate_log = False
-logger_name = "common"
-
-logging_config = {
-    "version": 1,
-    "loggers": {
-        "root": {"level": "INFO", "handlers": ["consoleHandler"]},
-        "common": {
-            "level": "INFO",
-            "handlers": ["consoleHandler"],
-            "qualname": "common",
-            "propagate": 0,
-        },
-    },
-    "handlers": {
-        "consoleHandler": {
-            "class": "logging.StreamHandler",
-            "level": "INFO",
-            "formatter": "standardFormatter",
-        },
-    },
-    "formatters": {
-        "standardFormatter": {
-            "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-            "datefmt": "",
-        }
-    },
-}
-
-if logging_config is not None:
-    try:
-        logging.config.dictConfig(logging_config)
-    except Exception:
-        traceback.print_exc()
-        exit(1)
-
-if use_accelerate_log:
-    import accelerate
-
-    get_logger = functools.partial(accelerate.logging.get_logger, name=logger_name)
-else:
-    get_logger = functools.partial(logging.getLogger, name=logger_name)
-
-logger = get_logger()
@@ -23,10 +23,13 @@ from ray.air import FailureConfig, RunConfig
 from ray.air.config import ScalingConfig
 from ray.train.torch import TorchTrainer
 
+from comps import CustomLogger
 from comps.finetuning.llm_on_ray import common
 from comps.finetuning.llm_on_ray.finetune.data_process import DataProcessor
 from comps.finetuning.llm_on_ray.finetune.finetune_config import FinetuneConfig
+
+logger = CustomLogger("llm_on_ray/finetune")
 
 
 def adapt_transformers_to_device(config: Dict):
     device = config["Training"]["device"]
@@ -332,10 +335,10 @@ def train_func(config: Dict[str, Any]):
 
     training_args, trainer = get_trainer(config, model, tokenizer, tokenized_dataset, data_collator)
 
-    common.logger.info("train start")
+    logger.info("train start")
     trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
     trainer.save_model()
-    common.logger.info("train finish")
+    logger.info("train finish")
 
 
 def get_finetune_config():
@@ -401,7 +404,7 @@ def main(external_config=None):
     else:
         ray.init(runtime_env=runtime_env)
 
-    common.logger.info(f"ray available resources = {ray.available_resources()}")
+    logger.info(f"ray available resources = {ray.available_resources()}")
     use_gpu = True if device == "gpu" else False
     scaling_config = ScalingConfig(
         num_workers=num_training_workers,