add resume finetuning checkpoint ut. (#646)

* add resume finetuning checkpoint ut.

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add final tuned model.

---------

Co-authored-by: root <root@idc708073.jf.intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
lkk
2024-09-10 16:35:23 +08:00
committed by GitHub
parent 94eb60f168
commit c718602abf
2 changed files with 93 additions and 6 deletions

View File

@@ -3,6 +3,7 @@
import os
import random
import re
import time
import urllib.parse
import uuid
@@ -61,7 +62,7 @@ def update_job_status(job_id: FineTuningJobID):
status = "cancelled" if status == "stopped" else status
logger.info(f"Status of job {job_id} is '{status}'")
running_finetuning_jobs[job_id].status = status
if status == "finished" or status == "cancelled" or status == "failed":
if status == "succeeded" or status == "cancelled" or status == "failed":
break
time.sleep(CHECK_JOB_STATUS_INTERVAL)
@@ -190,7 +191,21 @@ def handle_list_finetuning_checkpoints(request: FineTuningJobIDRequest):
checkpoints = []
if os.path.exists(output_dir):
# Iterate over the contents of the directory and add an entry for each
for _ in os.listdir(output_dir): # Loop over directory contents
files = os.listdir(output_dir)
for file in files: # Loop over directory contents
file_path = os.path.join(output_dir, file)
if os.path.isdir(file_path) and file.startswith("checkpoint"):
steps = re.findall("\d+", file)[0]
checkpointsResponse = FineTuningJobCheckpoint(
id=f"ftckpt-{uuid.uuid4()}", # Generate a unique ID
created_at=int(time.time()), # Use the current timestamp
fine_tuned_model_checkpoint=file_path, # Directory path itself
fine_tuning_job_id=fine_tuning_job_id,
object="fine_tuning.job.checkpoint",
step_number=steps,
)
checkpoints.append(checkpointsResponse)
if job.status == "succeeded":
checkpointsResponse = FineTuningJobCheckpoint(
id=f"ftckpt-{uuid.uuid4()}", # Generate a unique ID
created_at=int(time.time()), # Use the current timestamp
@@ -199,7 +214,6 @@ def handle_list_finetuning_checkpoints(request: FineTuningJobIDRequest):
object="fine_tuning.job.checkpoint",
)
checkpoints.append(checkpointsResponse)
checkpoint_id_to_checkpoint_path[checkpointsResponse.id] = checkpointsResponse.fine_tuned_model_checkpoint
return checkpoints