From ccc8b64cc21417cfd88d9b5e9bd62e15c1e834a1 Mon Sep 17 00:00:00 2001
From: hiyouga <467089858@qq.com>
Date: Fri, 7 Jun 2024 04:15:40 +0800
Subject: [PATCH] update data processors

---
 src/llamafactory/cli.py                      |   2 +-
 src/llamafactory/data/processors/feedback.py | 104 ++++++++++--------
 src/llamafactory/data/processors/pairwise.py |  82 ++++++++------
 .../data/processors/processor_utils.py       |  41 ++++++-
 .../data/processors/supervised.py            |  35 +-----
 .../data/processors/unsupervised.py          |  65 +++++++----
 6 files changed, 190 insertions(+), 139 deletions(-)

diff --git a/src/llamafactory/cli.py b/src/llamafactory/cli.py
index 092f4cf7..b9e734e4 100644
--- a/src/llamafactory/cli.py
+++ b/src/llamafactory/cli.py
@@ -72,7 +72,7 @@ def main():
     elif command == Command.EXPORT:
         export_model()
     elif command == Command.TRAIN:
-        disable_torchrun = os.environ.get("DISABLE_TORCHRUN", "0").lower() in ["true", "1"]
+        disable_torchrun = os.environ.get("TORCHRUN_DISABLED", "0").lower() in ["true", "1"]
         if disable_torchrun and get_device_count() > 1:
             logger.warning("`torchrun` cannot be disabled when device count > 1.")
             disable_torchrun = False
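
The changed cli.py line parses the flag case-insensitively, so "1", "true", or "True" all disable torchrun. A standalone sketch of the same parsing (illustrative, not part of the diff):

    import os

    os.environ["TORCHRUN_DISABLED"] = "True"  # hypothetical setting
    disable_torchrun = os.environ.get("TORCHRUN_DISABLED", "0").lower() in ["true", "1"]
    print(disable_torchrun)  # True
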
template: "Template", @@ -45,50 +94,17 @@ def preprocess_feedback_dataset( logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i])) continue - if processor is not None and not hasattr(processor, "image_seq_length"): # llava-like models - examples["prompt"][i][0]["content"] = template.image_token + examples["prompt"][i][0]["content"] - - if examples["response"][i][0]["content"]: # desired example - kto_tag = True - messages = examples["prompt"][i] + [examples["response"][i][0]] - else: # undesired example - kto_tag = False - messages = examples["prompt"][i] + [examples["response"][i][1]] - - if kl_response[i][0]["content"]: - kl_messages = examples["prompt"][i] + [kl_response[i][0]] - else: - kl_messages = examples["prompt"][i] + [kl_response[i][1]] - - prompt_ids, response_ids = template.encode_oneturn( - tokenizer, - messages, - examples["system"][i], - examples["tools"][i], - data_args.cutoff_len, - data_args.reserved_label_len, + input_ids, labels, kl_input_ids, kl_labels, kto_tag = _encode_feedback_example( + prompt=examples["prompt"][i], + response=examples["response"][i], + kl_response=kl_response[i], + system=examples["system"][i], + tools=examples["tools"][i], + template=template, + tokenizer=tokenizer, + processor=processor, + data_args=data_args, ) - _, kl_response_ids = template.encode_oneturn( - tokenizer, - kl_messages, - examples["system"][i], - examples["tools"][i], - data_args.cutoff_len, - data_args.reserved_label_len, - ) - - if template.efficient_eos: - response_ids += [tokenizer.eos_token_id] - kl_response_ids += [tokenizer.eos_token_id] - - if processor is not None and hasattr(processor, "image_seq_length"): # paligemma models - image_token_id = tokenizer.convert_tokens_to_ids(template.image_token) - prompt_ids = [image_token_id] * getattr(processor, "image_seq_length") + prompt_ids - - input_ids = prompt_ids + response_ids - labels = [IGNORE_INDEX] * len(prompt_ids) + response_ids - kl_input_ids = prompt_ids + kl_response_ids - kl_labels = [IGNORE_INDEX] * len(prompt_ids) + kl_response_ids model_inputs["input_ids"].append(input_ids) model_inputs["attention_mask"].append([1] * len(input_ids)) model_inputs["labels"].append(labels) diff --git a/src/llamafactory/data/processors/pairwise.py b/src/llamafactory/data/processors/pairwise.py index 8ad3979f..fe984efa 100644 --- a/src/llamafactory/data/processors/pairwise.py +++ b/src/llamafactory/data/processors/pairwise.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple from ...extras.constants import IGNORE_INDEX from ...extras.logging import get_logger @@ -16,6 +16,44 @@ if TYPE_CHECKING: logger = get_logger(__name__) +def _encode_pairwise_example( + prompt: Sequence[Dict[str, str]], + response: Sequence[Dict[str, str]], + system: Optional[str], + tools: Optional[str], + template: "Template", + tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], + data_args: "DataArguments", +) -> Tuple[List[int], List[int], List[int], List[int]]: + if processor is not None and not hasattr(processor, "image_seq_length"): # llava-like models + prompt[0]["content"] = template.image_token + prompt[0]["content"] + + chosen_messages = prompt + [response[0]] + rejected_messages = prompt + [response[1]] + prompt_ids, chosen_ids = template.encode_oneturn( + tokenizer, chosen_messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len + ) + _, rejected_ids = 
diff --git a/src/llamafactory/data/processors/pairwise.py b/src/llamafactory/data/processors/pairwise.py
index 8ad3979f..fe984efa 100644
--- a/src/llamafactory/data/processors/pairwise.py
+++ b/src/llamafactory/data/processors/pairwise.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple
 
 from ...extras.constants import IGNORE_INDEX
 from ...extras.logging import get_logger
@@ -16,6 +16,44 @@ if TYPE_CHECKING:
 logger = get_logger(__name__)
 
 
+def _encode_pairwise_example(
+    prompt: Sequence[Dict[str, str]],
+    response: Sequence[Dict[str, str]],
+    system: Optional[str],
+    tools: Optional[str],
+    template: "Template",
+    tokenizer: "PreTrainedTokenizer",
+    processor: Optional["ProcessorMixin"],
+    data_args: "DataArguments",
+) -> Tuple[List[int], List[int], List[int], List[int]]:
+    if processor is not None and not hasattr(processor, "image_seq_length"):  # llava-like models
+        prompt[0]["content"] = template.image_token + prompt[0]["content"]
+
+    chosen_messages = prompt + [response[0]]
+    rejected_messages = prompt + [response[1]]
+    prompt_ids, chosen_ids = template.encode_oneturn(
+        tokenizer, chosen_messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len
+    )
+    _, rejected_ids = template.encode_oneturn(
+        tokenizer, rejected_messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len
+    )
+
+    if template.efficient_eos:
+        chosen_ids += [tokenizer.eos_token_id]
+        rejected_ids += [tokenizer.eos_token_id]
+
+    if processor is not None and hasattr(processor, "image_seq_length"):  # paligemma models
+        image_token_id = tokenizer.convert_tokens_to_ids(template.image_token)
+        prompt_ids = [image_token_id] * getattr(processor, "image_seq_length") + prompt_ids
+
+    chosen_input_ids = prompt_ids + chosen_ids
+    chosen_labels = [IGNORE_INDEX] * len(prompt_ids) + chosen_ids
+    rejected_input_ids = prompt_ids + rejected_ids
+    rejected_labels = [IGNORE_INDEX] * len(prompt_ids) + rejected_ids
+
+    return chosen_input_ids, chosen_labels, rejected_input_ids, rejected_labels
+
+
 def preprocess_pairwise_dataset(
     examples: Dict[str, List[Any]],
     template: "Template",
@@ -43,40 +81,16 @@ def preprocess_pairwise_dataset(
             logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i]))
             continue
 
-        if processor is not None and not hasattr(processor, "image_seq_length"):  # llava-like models
-            examples["prompt"][i][0]["content"] = template.image_token + examples["prompt"][i][0]["content"]
-
-        chosen_messages = examples["prompt"][i] + [examples["response"][i][0]]
-        rejected_messages = examples["prompt"][i] + [examples["response"][i][1]]
-        prompt_ids, chosen_ids = template.encode_oneturn(
-            tokenizer,
-            chosen_messages,
-            examples["system"][i],
-            examples["tools"][i],
-            data_args.cutoff_len,
-            data_args.reserved_label_len,
+        chosen_input_ids, chosen_labels, rejected_input_ids, rejected_labels = _encode_pairwise_example(
+            prompt=examples["prompt"][i],
+            response=examples["response"][i],
+            system=examples["system"][i],
+            tools=examples["tools"][i],
+            template=template,
+            tokenizer=tokenizer,
+            processor=processor,
+            data_args=data_args,
        )
-        _, rejected_ids = template.encode_oneturn(
-            tokenizer,
-            rejected_messages,
-            examples["system"][i],
-            examples["tools"][i],
-            data_args.cutoff_len,
-            data_args.reserved_label_len,
-        )
-
-        if template.efficient_eos:
-            chosen_ids += [tokenizer.eos_token_id]
-            rejected_ids += [tokenizer.eos_token_id]
-
-        if processor is not None and hasattr(processor, "image_seq_length"):  # paligemma models
-            image_token_id = tokenizer.convert_tokens_to_ids(template.image_token)
-            prompt_ids = [image_token_id] * getattr(processor, "image_seq_length") + prompt_ids
-
-        chosen_input_ids = prompt_ids + chosen_ids
-        chosen_labels = [IGNORE_INDEX] * len(prompt_ids) + chosen_ids
-        rejected_input_ids = prompt_ids + rejected_ids
-        rejected_labels = [IGNORE_INDEX] * len(prompt_ids) + rejected_ids
         model_inputs["chosen_input_ids"].append(chosen_input_ids)
         model_inputs["chosen_attention_mask"].append([1] * len(chosen_input_ids))
         model_inputs["chosen_labels"].append(chosen_labels)
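
_encode_pairwise_example assumes response[0] is the preferred answer and response[1] the rejected one, and builds one message list per side over a shared prompt. A sketch of that construction with made-up data (no tokenizer needed):

    prompt = [{"role": "user", "content": "What is 2 + 2?"}]
    response = [
        {"role": "assistant", "content": "2 + 2 = 4."},  # response[0]: chosen
        {"role": "assistant", "content": "It is 5."},    # response[1]: rejected
    ]

    chosen_messages = prompt + [response[0]]
    rejected_messages = prompt + [response[1]]
    print(len(chosen_messages), len(rejected_messages))  # 2 2
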
+ """ + index = bisect.bisect(numbers, capacity) + return -1 if index == 0 else (index - 1) + + +def greedy_knapsack(numbers: List[int], capacity: int) -> List[List[int]]: + r""" + An efficient greedy algorithm with binary search for the knapsack problem. + """ + numbers.sort() # sort numbers in ascending order for binary search + knapsacks = [] + + while numbers: + current_knapsack = [] + remaining_capacity = capacity + + while True: + index = search_for_fit(numbers, remaining_capacity) + if index == -1: + break # no more numbers fit in this knapsack + + remaining_capacity -= numbers[index] # update the remaining capacity + current_knapsack.append(numbers.pop(index)) # add the number to knapsack + + knapsacks.append(current_knapsack) + + return knapsacks + + def get_pixel_values(images: Sequence["ImageObject"], processor: "ProcessorMixin") -> "NDArray": - # process visual inputs (currently only supports a single image) + r""" + Processes visual inputs. (currently only supports a single image) + """ image_processor: "BaseImageProcessor" = getattr(processor, "image_processor") image = images[0] if len(images) != 0 else Image.new("RGB", (100, 100), (255, 255, 255)) return image_processor(image, return_tensors="pt")["pixel_values"][0] # shape (C, H, W) def get_paligemma_token_type_ids(input_len: int, processor: "ProcessorMixin") -> List[int]: - # get paligemma token type ids for computing loss + r""" + Gets paligemma token type ids for computing loss. + """ image_seq_length = getattr(processor, "image_seq_length") return [0] * image_seq_length + [1] * (input_len - image_seq_length) diff --git a/src/llamafactory/data/processors/supervised.py b/src/llamafactory/data/processors/supervised.py index 188c9f80..19d60280 100644 --- a/src/llamafactory/data/processors/supervised.py +++ b/src/llamafactory/data/processors/supervised.py @@ -1,10 +1,9 @@ -import bisect from collections import defaultdict from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple from ...extras.constants import IGNORE_INDEX from ...extras.logging import get_logger -from .processor_utils import get_paligemma_token_type_ids, get_pixel_values +from .processor_utils import get_paligemma_token_type_ids, get_pixel_values, greedy_knapsack if TYPE_CHECKING: @@ -18,38 +17,6 @@ if TYPE_CHECKING: logger = get_logger(__name__) -def search_for_fit(numbers: Sequence[int], capacity: int) -> int: - r""" - Finds the index of largest number that fits into the knapsack with the given capacity. - """ - index = bisect.bisect(numbers, capacity) - return -1 if index == 0 else (index - 1) - - -def greedy_knapsack(numbers: List[int], capacity: int) -> List[List[int]]: - r""" - An efficient greedy algorithm with binary search for the knapsack problem. 
- """ - numbers.sort() # sort numbers in ascending order for binary search - knapsacks = [] - - while numbers: - current_knapsack = [] - remaining_capacity = capacity - - while True: - index = search_for_fit(numbers, remaining_capacity) - if index == -1: - break # no more numbers fit in this knapsack - - remaining_capacity -= numbers[index] # update the remaining capacity - current_knapsack.append(numbers.pop(index)) # add the number to knapsack - - knapsacks.append(current_knapsack) - - return knapsacks - - def _encode_supervised_example( prompt: Sequence[Dict[str, str]], response: Sequence[Dict[str, str]], diff --git a/src/llamafactory/data/processors/unsupervised.py b/src/llamafactory/data/processors/unsupervised.py index e00bde55..f711eeac 100644 --- a/src/llamafactory/data/processors/unsupervised.py +++ b/src/llamafactory/data/processors/unsupervised.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple from ...extras.logging import get_logger from ..data_utils import Role @@ -16,6 +16,37 @@ if TYPE_CHECKING: logger = get_logger(__name__) +def _encode_unsupervised_example( + prompt: Sequence[Dict[str, str]], + response: Sequence[Dict[str, str]], + system: Optional[str], + tools: Optional[str], + template: "Template", + tokenizer: "PreTrainedTokenizer", + processor: Optional["ProcessorMixin"], + data_args: "DataArguments", +) -> Tuple[List[int], List[int]]: + if processor is not None and not hasattr(processor, "image_seq_length"): # llava-like models + prompt[0]["content"] = template.image_token + prompt[0]["content"] + + if len(response) == 1: + messages = prompt + response + else: + messages = prompt + [{"role": Role.ASSISTANT.value, "content": ""}] + + input_ids, labels = template.encode_oneturn( + tokenizer, messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len + ) + if template.efficient_eos: + labels += [tokenizer.eos_token_id] + + if processor is not None and hasattr(processor, "image_seq_length"): # paligemma models + image_token_id = tokenizer.convert_tokens_to_ids(template.image_token) + input_ids = [image_token_id] * getattr(processor, "image_seq_length") + input_ids + + return input_ids, labels + + def preprocess_unsupervised_dataset( examples: Dict[str, List[Any]], template: "Template", @@ -35,30 +66,16 @@ def preprocess_unsupervised_dataset( logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i])) continue - if processor is not None and not hasattr(processor, "image_seq_length"): # llava-like models - examples["prompt"][i][0]["content"] = template.image_token + examples["prompt"][i][0]["content"] - - if len(examples["response"][i]) == 1: - messages = examples["prompt"][i] + examples["response"][i] - else: - messages = examples["prompt"][i] + [{"role": Role.ASSISTANT.value, "content": ""}] - - input_ids, labels = template.encode_oneturn( - tokenizer, - messages, - examples["system"][i], - examples["tools"][i], - data_args.cutoff_len, - data_args.reserved_label_len, + input_ids, labels = _encode_unsupervised_example( + prompt=examples["prompt"][i], + response=examples["response"][i], + system=examples["system"][i], + tools=examples["tools"][i], + template=template, + tokenizer=tokenizer, + processor=processor, + data_args=data_args, ) - - if template.efficient_eos: - labels += [tokenizer.eos_token_id] - - if processor is not None and hasattr(processor, "image_seq_length"): # paligemma models - 
diff --git a/src/llamafactory/data/processors/unsupervised.py b/src/llamafactory/data/processors/unsupervised.py
index e00bde55..f711eeac 100644
--- a/src/llamafactory/data/processors/unsupervised.py
+++ b/src/llamafactory/data/processors/unsupervised.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple
 
 from ...extras.logging import get_logger
 from ..data_utils import Role
@@ -16,6 +16,37 @@ if TYPE_CHECKING:
 logger = get_logger(__name__)
 
 
+def _encode_unsupervised_example(
+    prompt: Sequence[Dict[str, str]],
+    response: Sequence[Dict[str, str]],
+    system: Optional[str],
+    tools: Optional[str],
+    template: "Template",
+    tokenizer: "PreTrainedTokenizer",
+    processor: Optional["ProcessorMixin"],
+    data_args: "DataArguments",
+) -> Tuple[List[int], List[int]]:
+    if processor is not None and not hasattr(processor, "image_seq_length"):  # llava-like models
+        prompt[0]["content"] = template.image_token + prompt[0]["content"]
+
+    if len(response) == 1:
+        messages = prompt + response
+    else:
+        messages = prompt + [{"role": Role.ASSISTANT.value, "content": ""}]
+
+    input_ids, labels = template.encode_oneturn(
+        tokenizer, messages, system, tools, data_args.cutoff_len, data_args.reserved_label_len
+    )
+    if template.efficient_eos:
+        labels += [tokenizer.eos_token_id]
+
+    if processor is not None and hasattr(processor, "image_seq_length"):  # paligemma models
+        image_token_id = tokenizer.convert_tokens_to_ids(template.image_token)
+        input_ids = [image_token_id] * getattr(processor, "image_seq_length") + input_ids
+
+    return input_ids, labels
+
+
 def preprocess_unsupervised_dataset(
     examples: Dict[str, List[Any]],
     template: "Template",
@@ -35,30 +66,16 @@ def preprocess_unsupervised_dataset(
             logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i]))
             continue
 
-        if processor is not None and not hasattr(processor, "image_seq_length"):  # llava-like models
-            examples["prompt"][i][0]["content"] = template.image_token + examples["prompt"][i][0]["content"]
-
-        if len(examples["response"][i]) == 1:
-            messages = examples["prompt"][i] + examples["response"][i]
-        else:
-            messages = examples["prompt"][i] + [{"role": Role.ASSISTANT.value, "content": ""}]
-
-        input_ids, labels = template.encode_oneturn(
-            tokenizer,
-            messages,
-            examples["system"][i],
-            examples["tools"][i],
-            data_args.cutoff_len,
-            data_args.reserved_label_len,
+        input_ids, labels = _encode_unsupervised_example(
+            prompt=examples["prompt"][i],
+            response=examples["response"][i],
+            system=examples["system"][i],
+            tools=examples["tools"][i],
+            template=template,
+            tokenizer=tokenizer,
+            processor=processor,
+            data_args=data_args,
         )
-
-        if template.efficient_eos:
-            labels += [tokenizer.eos_token_id]
-
-        if processor is not None and hasattr(processor, "image_seq_length"):  # paligemma models
-            image_token_id = tokenizer.convert_tokens_to_ids(template.image_token)
-            input_ids = [image_token_id] * getattr(processor, "image_seq_length") + input_ids
-
         model_inputs["input_ids"].append(input_ids)
         model_inputs["attention_mask"].append([1] * len(input_ids))
         model_inputs["labels"].append(labels)
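
In the unsupervised path, an example may carry a single reference response or none; when none is present, _encode_unsupervised_example appends an empty assistant turn so encode_oneturn still sees one complete turn. A sketch of that branch with made-up data ("assistant" stands in for Role.ASSISTANT.value):

    ASSISTANT = "assistant"  # assumption: mirrors Role.ASSISTANT.value

    prompt = [{"role": "user", "content": "Summarize this paragraph."}]
    response = []  # no reference response in the unsupervised case

    if len(response) == 1:
        messages = prompt + response
    else:
        messages = prompt + [{"role": ASSISTANT, "content": ""}]

    print(messages[-1])  # {'role': 'assistant', 'content': ''}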