Merge pull request #4204 from dignfei/main
fixbug:llama3在增量预训练时应该使用<|end_of_text|>标识文本的结束
This commit is contained in:
commit
9049aab911
|
@ -12,7 +12,8 @@ def preprocess_pretrain_dataset(
|
||||||
examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments"
|
examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments"
|
||||||
) -> Dict[str, List[List[int]]]:
|
) -> Dict[str, List[List[int]]]:
|
||||||
# build grouped texts with format `X1 X2 X3 ...` if packing is enabled
|
# build grouped texts with format `X1 X2 X3 ...` if packing is enabled
|
||||||
text_examples = [messages[0]["content"] + tokenizer.eos_token for messages in examples["prompt"]]
|
eos_token = "<|end_of_text|>" if data_args.template == "llama3" else tokenizer.eos_token
|
||||||
|
text_examples = [messages[0]["content"] + eos_token for messages in examples["prompt"]]
|
||||||
|
|
||||||
if not data_args.packing:
|
if not data_args.packing:
|
||||||
if data_args.template == "gemma":
|
if data_args.template == "gemma":
|
||||||
|
|
Loading…
Reference in New Issue