From 4ecadc35122340b3e520804270c1c1d16c696830 Mon Sep 17 00:00:00 2001
From: hiyouga
Date: Sat, 3 Feb 2024 23:14:31 +0800
Subject: [PATCH] fix #2376

---
 src/llmtuner/data/preprocess.py | 6 +-----
 src/llmtuner/model/patcher.py   | 2 +-
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/llmtuner/data/preprocess.py b/src/llmtuner/data/preprocess.py
index e5cd9489..9ad8b4e2 100644
--- a/src/llmtuner/data/preprocess.py
+++ b/src/llmtuner/data/preprocess.py
@@ -22,12 +22,8 @@ def preprocess_pretrain_dataset(
     examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments"
 ) -> Dict[str, List[List[int]]]:
     # build grouped texts with format `X1 X2 X3 ...`
-    text_examples = [examples["prompt"][i][0]["content"] for i in range(len(examples["prompt"]))]
+    text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]]
     tokenized_examples = tokenizer(text_examples, add_special_tokens=False)
-    for i in range(len(tokenized_examples["input_ids"])):
-        tokenized_examples["input_ids"][i] += [tokenizer.eos_token_id]
-        tokenized_examples["attention_mask"][i] += [1]
-
     concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()}
     total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]])
     block_size = data_args.cutoff_len
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index ac0cc08c..95b01f73 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -110,7 +110,7 @@ def _configure_attn_implementation(model_args: "ModelArguments", config_kwargs:
             logger.warning("FlashAttention2 is not installed.")
             config_kwargs["attn_implementation"] = None
     else:
-        config_kwargs["attn_implementation"] = "eager"
+        config_kwargs["attn_implementation"] = "eager"
 
 
 def _configure_rope(config: "PretrainedConfig", model_args: "ModelArguments", is_trainable: bool) -> None:
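
Note (not part of the patch): below is a minimal, self-contained sketch of the packed-pretraining flow after this change. Instead of appending `tokenizer.eos_token_id` to each sequence in a post-tokenization loop, the EOS token is appended as text, so the tokenizer produces `input_ids` and `attention_mask` in a single consistent pass; the tokens are then concatenated and sliced into fixed blocks. The standalone function name `preprocess_pretrain`, the `gpt2` tokenizer, and the hard-coded `block_size` (standing in for `data_args.cutoff_len`) are assumptions for illustration, not code from the repository.

from itertools import chain
from typing import Any, Dict, List

from transformers import AutoTokenizer  # assumption: any HF tokenizer with an eos_token works


def preprocess_pretrain(examples: Dict[str, List[Any]], tokenizer, block_size: int) -> Dict[str, List[List[int]]]:
    # Append EOS as *text* before tokenization, replacing the removed
    # per-sequence `input_ids += [tokenizer.eos_token_id]` loop.
    text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]]
    tokenized = tokenizer(text_examples, add_special_tokens=False)

    # Concatenate every field (input_ids, attention_mask) across examples,
    # then drop the tail that does not fill a whole block.
    concatenated = {k: list(chain(*tokenized[k])) for k in tokenized.keys()}
    total_length = (len(concatenated["input_ids"]) // block_size) * block_size

    # Slice the stream into contiguous blocks of `block_size` tokens.
    return {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }


if __name__ == "__main__":
    tok = AutoTokenizer.from_pretrained("gpt2")  # assumption: gpt2 as a stand-in model
    batch = {"prompt": [[{"content": "hello world"}], [{"content": "foo bar baz"}]]}
    print(preprocess_pretrain(batch, tok, block_size=4)["input_ids"])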