testrun: test 910b qwen
This commit is contained in:
parent 9469b4c256
commit 8d6f544698

@@ -0,0 +1 @@
{"cur_time": "2024-09-18 15:28:38", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
@@ -0,0 +1,7 @@
[2024-09-18 15:28:52,091] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to npu (auto detect)
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
09/18/2024 15:28:57 - INFO - llamafactory.hparams.parser - Process rank: 0, device: npu:0, n_gpu: 1, distributed training: False, compute dtype: torch.float16
09/18/2024 15:28:58 - INFO - llamafactory.data.template - Add eos token: <|im_end|>
09/18/2024 15:28:58 - INFO - llamafactory.data.template - Add pad token: <|im_end|>
09/18/2024 15:28:58 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_1M_CN...
@@ -0,0 +1,31 @@
cutoff_len: 1024
dataset: belle_1m
ddp_timeout: 180000000
do_train: true
eval_steps: 500
eval_strategy: steps
finetuning_type: lora
fp16: true
gradient_accumulation_steps: 8
include_num_input_tokens_seen: true
include_tokens_per_second: true
learning_rate: 0.0001
logging_steps: 3
lora_target: all
lr_scheduler_type: cosine
max_samples: 10000
max_steps: 50
model_name_or_path: ../../../models/qwen
num_train_epochs: 10.0
output_dir: ./results/lora_sft_Qwen-7B_8_gpu_50_step_20240918152834
overwrite_cache: true
overwrite_output_dir: true
per_device_eval_batch_size: 2
per_device_train_batch_size: 2
plot_loss: true
preprocessing_num_workers: 16
save_steps: 500
stage: sft
template: qwen
val_size: 0.1
warmup_ratio: 0.1
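The YAML above is a LLaMA-Factory SFT config; such a file is typically passed to `llamafactory-cli train <config>.yaml`. As a quick sanity check on the numbers it implies (a sketch, not part of the run; the filename is hypothetical), the effective batch size works out to 16, matching the `total_train_batch_size` reported in the model card further down:

```python
import yaml  # assumes PyYAML is installed

# Hypothetical filename; the YAML block above is what such a file would contain.
with open("qwen_lora_sft.yaml") as f:
    cfg = yaml.safe_load(f)

world_size = 1  # the log above reports n_gpu: 1, distributed training: False
effective_batch_size = (
    cfg["per_device_train_batch_size"]    # 2
    * cfg["gradient_accumulation_steps"]  # 8
    * world_size                          # 1
)
print(effective_batch_size)  # -> 16
```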
@@ -0,0 +1 @@
{"cur_time": "2024-09-18 15:29:45", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
@@ -0,0 +1,7 @@
[2024-09-18 15:29:56,836] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to npu (auto detect)
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
09/18/2024 15:30:02 - INFO - llamafactory.hparams.parser - Process rank: 0, device: npu:0, n_gpu: 1, distributed training: False, compute dtype: torch.float16
09/18/2024 15:30:03 - INFO - llamafactory.data.template - Add eos token: <|im_end|>
09/18/2024 15:30:03 - INFO - llamafactory.data.template - Add pad token: <|im_end|>
09/18/2024 15:30:03 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_1M_CN...
@@ -0,0 +1,31 @@
cutoff_len: 1024
dataset: belle_1m
ddp_timeout: 180000000
do_train: true
eval_steps: 500
eval_strategy: steps
finetuning_type: lora
fp16: true
gradient_accumulation_steps: 8
include_num_input_tokens_seen: true
include_tokens_per_second: true
learning_rate: 0.0001
logging_steps: 3
lora_target: all
lr_scheduler_type: cosine
max_samples: 10000
max_steps: 50
model_name_or_path: ../../../models/qwen
num_train_epochs: 10.0
output_dir: ./results/lora_sft_Qwen-7B_8_gpu_50_step_20240918152941
overwrite_cache: true
overwrite_output_dir: true
per_device_eval_batch_size: 2
per_device_train_batch_size: 2
plot_loss: true
preprocessing_num_workers: 16
save_steps: 500
stage: sft
template: qwen
val_size: 0.1
warmup_ratio: 0.1
@@ -0,0 +1,63 @@
---
base_model: ../../../models/qwen
library_name: peft
license: other
tags:
- llama-factory
- lora
- generated_from_trainer
model-index:
- name: lora_sft_Qwen-7B_8_gpu_50_step_20240918153046
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

# lora_sft_Qwen-7B_8_gpu_50_step_20240918153046

This model is a fine-tuned version of [../../../models/qwen](https://huggingface.co/../../../models/qwen) on the belle_1m dataset.
It achieves the following results on the evaluation set:
- Loss: 1.3353
- Num Input Tokens Seen: 132048

## Model description

More information needed

## Intended uses & limitations

More information needed

## Training and evaluation data

More information needed

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 0.0001
- train_batch_size: 2
- eval_batch_size: 2
- seed: 42
- gradient_accumulation_steps: 8
- total_train_batch_size: 16
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: cosine
- lr_scheduler_warmup_ratio: 0.1
- training_steps: 50
- mixed_precision_training: Native AMP

### Training results

### Framework versions

- PEFT 0.12.0
- Transformers 4.43.4
- Pytorch 2.1.0
- Datasets 2.20.0
- Tokenizers 0.19.1
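The hyperparameters listed in the model card above map onto a standard AdamW plus cosine-with-warmup setup. A minimal sketch of how these settings could be reconstructed in PyTorch/transformers (assuming the Trainer's "Adam with betas=(0.9,0.999)" line corresponds to AdamW, its default; the single stand-in parameter replaces the roughly 17.9M trainable LoRA parameters reported in the training log):

```python
import torch
from transformers import get_cosine_schedule_with_warmup

# Stand-in parameter so the sketch runs on its own; in the real run these would be
# the trainable LoRA parameters.
params = [torch.nn.Parameter(torch.zeros(1))]

optimizer = torch.optim.AdamW(params, lr=1e-4, betas=(0.9, 0.999), eps=1e-8)

training_steps = 50
warmup_steps = int(0.1 * training_steps)  # warmup_ratio 0.1 -> 5 warmup steps

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=training_steps,
)

for _ in range(training_steps):
    optimizer.step()
    scheduler.step()
```

With warmup_ratio 0.1 over 50 steps, the learning rate ramps up for the first 5 steps and then decays along a cosine curve, which is consistent with the learning-rate column recorded in trainer_state.json below.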
@@ -0,0 +1,31 @@
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": "../../../models/qwen",
  "bias": "none",
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 16,
  "lora_dropout": 0.0,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 8,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "w2",
    "c_proj",
    "w1",
    "c_attn"
  ],
  "task_type": "CAUSAL_LM",
  "use_dora": false,
  "use_rslora": false
}
Binary file not shown.
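This adapter config (r=8, alpha=16, targeting w1, w2, c_proj and c_attn) can be attached back onto the base model with PEFT. A minimal loading sketch, assuming the adapter was saved to the first run's output_dir (the exact path may differ per run):

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_path = "../../../models/qwen"  # base_model_name_or_path from the config above
# Assumed adapter location: the output_dir of the first run in this commit.
adapter_path = "./results/lora_sft_Qwen-7B_8_gpu_50_step_20240918152834"

# trust_remote_code is required because Qwen-7B ships its own modeling/tokenizer code
base = AutoModelForCausalLM.from_pretrained(
    base_path, torch_dtype=torch.float16, trust_remote_code=True
)
model = PeftModel.from_pretrained(base, adapter_path)  # attaches the r=8 LoRA weights
model = model.merge_and_unload()  # optional: fold the adapter into the base weights
```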
@@ -0,0 +1,14 @@
{
  "epoch": 0.08888888888888889,
  "eval_loss": 1.3352934122085571,
  "eval_runtime": 55.0087,
  "eval_samples_per_second": 18.179,
  "eval_steps_per_second": 9.089,
  "num_input_tokens_seen": 132048,
  "total_flos": 5638623387844608.0,
  "train_loss": 1.4425424909591675,
  "train_runtime": 174.0133,
  "train_samples_per_second": 4.597,
  "train_steps_per_second": 0.287,
  "train_tokens_per_second": 1140.143
}
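A small consistency check on the evaluation numbers above (pure arithmetic, not part of the run): the runtime and samples-per-second figures imply roughly 1,000 eval samples, which is exactly val_size (0.1) of the 10,000 max_samples in the config.

```python
eval_runtime = 55.0087              # seconds, from all_results above
eval_samples_per_second = 18.179
val_size, max_samples = 0.1, 10000  # from the training config

approx_eval_samples = eval_runtime * eval_samples_per_second
print(round(approx_eval_samples))   # ~1000, i.e. val_size * max_samples
```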
@@ -0,0 +1,202 @@
|
||||||
|
---
|
||||||
|
base_model: ../../../models/qwen
|
||||||
|
library_name: peft
|
||||||
|
---
|
||||||
|
|
||||||
|
# Model Card for Model ID
|
||||||
|
|
||||||
|
<!-- Provide a quick summary of what the model is/does. -->
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Model Details
|
||||||
|
|
||||||
|
### Model Description
|
||||||
|
|
||||||
|
<!-- Provide a longer summary of what this model is. -->
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
- **Developed by:** [More Information Needed]
|
||||||
|
- **Funded by [optional]:** [More Information Needed]
|
||||||
|
- **Shared by [optional]:** [More Information Needed]
|
||||||
|
- **Model type:** [More Information Needed]
|
||||||
|
- **Language(s) (NLP):** [More Information Needed]
|
||||||
|
- **License:** [More Information Needed]
|
||||||
|
- **Finetuned from model [optional]:** [More Information Needed]
|
||||||
|
|
||||||
|
### Model Sources [optional]
|
||||||
|
|
||||||
|
<!-- Provide the basic links for the model. -->
|
||||||
|
|
||||||
|
- **Repository:** [More Information Needed]
|
||||||
|
- **Paper [optional]:** [More Information Needed]
|
||||||
|
- **Demo [optional]:** [More Information Needed]
|
||||||
|
|
||||||
|
## Uses
|
||||||
|
|
||||||
|
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
||||||
|
|
||||||
|
### Direct Use
|
||||||
|
|
||||||
|
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Downstream Use [optional]
|
||||||
|
|
||||||
|
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Out-of-Scope Use
|
||||||
|
|
||||||
|
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Bias, Risks, and Limitations
|
||||||
|
|
||||||
|
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Recommendations
|
||||||
|
|
||||||
|
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
||||||
|
|
||||||
|
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
||||||
|
|
||||||
|
## How to Get Started with the Model
|
||||||
|
|
||||||
|
Use the code below to get started with the model.
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Training Details
|
||||||
|
|
||||||
|
### Training Data
|
||||||
|
|
||||||
|
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Training Procedure
|
||||||
|
|
||||||
|
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
||||||
|
|
||||||
|
#### Preprocessing [optional]
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
|
||||||
|
#### Training Hyperparameters
|
||||||
|
|
||||||
|
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
||||||
|
|
||||||
|
#### Speeds, Sizes, Times [optional]
|
||||||
|
|
||||||
|
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Evaluation
|
||||||
|
|
||||||
|
<!-- This section describes the evaluation protocols and provides the results. -->
|
||||||
|
|
||||||
|
### Testing Data, Factors & Metrics
|
||||||
|
|
||||||
|
#### Testing Data
|
||||||
|
|
||||||
|
<!-- This should link to a Dataset Card if possible. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Factors
|
||||||
|
|
||||||
|
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Metrics
|
||||||
|
|
||||||
|
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Results
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Summary
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Model Examination [optional]
|
||||||
|
|
||||||
|
<!-- Relevant interpretability work for the model goes here -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Environmental Impact
|
||||||
|
|
||||||
|
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
||||||
|
|
||||||
|
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
||||||
|
|
||||||
|
- **Hardware Type:** [More Information Needed]
|
||||||
|
- **Hours used:** [More Information Needed]
|
||||||
|
- **Cloud Provider:** [More Information Needed]
|
||||||
|
- **Compute Region:** [More Information Needed]
|
||||||
|
- **Carbon Emitted:** [More Information Needed]
|
||||||
|
|
||||||
|
## Technical Specifications [optional]
|
||||||
|
|
||||||
|
### Model Architecture and Objective
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
### Compute Infrastructure
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Hardware
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
#### Software
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Citation [optional]
|
||||||
|
|
||||||
|
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
||||||
|
|
||||||
|
**BibTeX:**
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
**APA:**
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Glossary [optional]
|
||||||
|
|
||||||
|
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## More Information [optional]
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Model Card Authors [optional]
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
|
||||||
|
## Model Card Contact
|
||||||
|
|
||||||
|
[More Information Needed]
|
||||||
|
### Framework versions
|
||||||
|
|
||||||
|
- PEFT 0.12.0
|
@@ -0,0 +1,31 @@
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": "../../../models/qwen",
  "bias": "none",
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 16,
  "lora_dropout": 0.0,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 8,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "w2",
    "c_proj",
    "w1",
    "c_attn"
  ],
  "task_type": "CAUSAL_LM",
  "use_dora": false,
  "use_rslora": false
}
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,10 @@
{
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<|im_end|>"
}
@@ -0,0 +1,276 @@
|
||||||
|
# Copyright (c) Alibaba Cloud.
|
||||||
|
#
|
||||||
|
# This source code is licensed under the license found in the
|
||||||
|
# LICENSE file in the root directory of this source tree.
|
||||||
|
|
||||||
|
"""Tokenization classes for QWen."""
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import unicodedata
|
||||||
|
from typing import Collection, Dict, List, Set, Tuple, Union
|
||||||
|
|
||||||
|
import tiktoken
|
||||||
|
from transformers import PreTrainedTokenizer, AddedToken
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
|
||||||
|
|
||||||
|
PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
|
||||||
|
ENDOFTEXT = "<|endoftext|>"
|
||||||
|
IMSTART = "<|im_start|>"
|
||||||
|
IMEND = "<|im_end|>"
|
||||||
|
# as the default behavior is changed to allow special tokens in
|
||||||
|
# regular texts, the surface forms of special tokens need to be
|
||||||
|
# as different as possible to minimize the impact
|
||||||
|
EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
|
||||||
|
# changed to use actual index to avoid misconfiguration with vocabulary expansion
|
||||||
|
SPECIAL_START_ID = 151643
|
||||||
|
SPECIAL_TOKENS = tuple(
|
||||||
|
enumerate(
|
||||||
|
(
|
||||||
|
(
|
||||||
|
ENDOFTEXT,
|
||||||
|
IMSTART,
|
||||||
|
IMEND,
|
||||||
|
)
|
||||||
|
+ EXTRAS
|
||||||
|
),
|
||||||
|
start=SPECIAL_START_ID,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
|
||||||
|
with open(tiktoken_bpe_file, "rb") as f:
|
||||||
|
contents = f.read()
|
||||||
|
return {
|
||||||
|
base64.b64decode(token): int(rank)
|
||||||
|
for token, rank in (line.split() for line in contents.splitlines() if line)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class QWenTokenizer(PreTrainedTokenizer):
|
||||||
|
"""QWen tokenizer."""
|
||||||
|
|
||||||
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
vocab_file,
|
||||||
|
errors="replace",
|
||||||
|
extra_vocab_file=None,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
|
# how to handle errors in decoding UTF-8 byte sequences
|
||||||
|
# use ignore if you are in streaming inference
|
||||||
|
self.errors = errors
|
||||||
|
|
||||||
|
self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: Dict[bytes, int]
|
||||||
|
self.special_tokens = {
|
||||||
|
token: index
|
||||||
|
for index, token in SPECIAL_TOKENS
|
||||||
|
}
|
||||||
|
|
||||||
|
# try to load extra vocab from file
|
||||||
|
if extra_vocab_file is not None:
|
||||||
|
used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
|
||||||
|
extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
|
||||||
|
for token, index in extra_mergeable_ranks.items():
|
||||||
|
if token in self.mergeable_ranks:
|
||||||
|
logger.info(f"extra token {token} exists, skipping")
|
||||||
|
continue
|
||||||
|
if index in used_ids:
|
||||||
|
logger.info(f'the index {index} for extra token {token} exists, skipping')
|
||||||
|
continue
|
||||||
|
self.mergeable_ranks[token] = index
|
||||||
|
# the index may be sparse after this, but don't worry tiktoken.Encoding will handle this
|
||||||
|
|
||||||
|
enc = tiktoken.Encoding(
|
||||||
|
"Qwen",
|
||||||
|
pat_str=PAT_STR,
|
||||||
|
mergeable_ranks=self.mergeable_ranks,
|
||||||
|
special_tokens=self.special_tokens,
|
||||||
|
)
|
||||||
|
assert (
|
||||||
|
len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
|
||||||
|
), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
|
||||||
|
|
||||||
|
self.decoder = {
|
||||||
|
v: k for k, v in self.mergeable_ranks.items()
|
||||||
|
} # type: dict[int, bytes|str]
|
||||||
|
self.decoder.update({v: k for k, v in self.special_tokens.items()})
|
||||||
|
|
||||||
|
self.tokenizer = enc # type: tiktoken.Encoding
|
||||||
|
|
||||||
|
self.eod_id = self.tokenizer.eot_token
|
||||||
|
self.im_start_id = self.special_tokens[IMSTART]
|
||||||
|
self.im_end_id = self.special_tokens[IMEND]
|
||||||
|
|
||||||
|
def __getstate__(self):
|
||||||
|
# for pickle lovers
|
||||||
|
state = self.__dict__.copy()
|
||||||
|
del state["tokenizer"]
|
||||||
|
return state
|
||||||
|
|
||||||
|
def __setstate__(self, state):
|
||||||
|
# tokenizer is not python native; don't pass it; rebuild it
|
||||||
|
self.__dict__.update(state)
|
||||||
|
enc = tiktoken.Encoding(
|
||||||
|
"Qwen",
|
||||||
|
pat_str=PAT_STR,
|
||||||
|
mergeable_ranks=self.mergeable_ranks,
|
||||||
|
special_tokens=self.special_tokens,
|
||||||
|
)
|
||||||
|
self.tokenizer = enc
|
||||||
|
|
||||||
|
def __len__(self) -> int:
|
||||||
|
return self.tokenizer.n_vocab
|
||||||
|
|
||||||
|
def get_vocab(self) -> Dict[bytes, int]:
|
||||||
|
return self.mergeable_ranks
|
||||||
|
|
||||||
|
def convert_tokens_to_ids(
|
||||||
|
self, tokens: Union[bytes, str, List[Union[bytes, str]]]
|
||||||
|
) -> List[int]:
|
||||||
|
ids = []
|
||||||
|
if isinstance(tokens, (str, bytes)):
|
||||||
|
if tokens in self.special_tokens:
|
||||||
|
return self.special_tokens[tokens]
|
||||||
|
else:
|
||||||
|
return self.mergeable_ranks.get(tokens)
|
||||||
|
for token in tokens:
|
||||||
|
if token in self.special_tokens:
|
||||||
|
ids.append(self.special_tokens[token])
|
||||||
|
else:
|
||||||
|
ids.append(self.mergeable_ranks.get(token))
|
||||||
|
return ids
|
||||||
|
|
||||||
|
def _add_tokens(
|
||||||
|
self,
|
||||||
|
new_tokens: Union[List[str], List[AddedToken]],
|
||||||
|
special_tokens: bool = False,
|
||||||
|
) -> int:
|
||||||
|
if not special_tokens and new_tokens:
|
||||||
|
raise ValueError("Adding regular tokens is not supported")
|
||||||
|
for token in new_tokens:
|
||||||
|
surface_form = token.content if isinstance(token, AddedToken) else token
|
||||||
|
if surface_form not in SPECIAL_TOKENS_SET:
|
||||||
|
raise ValueError("Adding unknown special tokens is not supported")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
|
||||||
|
"""
|
||||||
|
Save only the vocabulary of the tokenizer (vocabulary).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`Tuple(str)`: Paths to the files saved.
|
||||||
|
"""
|
||||||
|
file_path = os.path.join(save_directory, "qwen.tiktoken")
|
||||||
|
with open(file_path, "w", encoding="utf8") as w:
|
||||||
|
for k, v in self.mergeable_ranks.items():
|
||||||
|
line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
|
||||||
|
w.write(line)
|
||||||
|
return (file_path,)
|
||||||
|
|
||||||
|
def tokenize(
|
||||||
|
self,
|
||||||
|
text: str,
|
||||||
|
allowed_special: Union[Set, str] = "all",
|
||||||
|
disallowed_special: Union[Collection, str] = (),
|
||||||
|
**kwargs,
|
||||||
|
) -> List[Union[bytes, str]]:
|
||||||
|
"""
|
||||||
|
Converts a string into a sequence of tokens.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text (`str`):
|
||||||
|
The sequence to be encoded.
|
||||||
|
allowed_special (`Literal["all"]` or `set`):
|
||||||
|
The surface forms of the tokens to be encoded as special tokens in regular texts.
|
||||||
|
Default to "all".
|
||||||
|
disallowed_special (`Literal["all"]` or `Collection`):
|
||||||
|
The surface forms of the tokens that should not be in regular texts and trigger errors.
|
||||||
|
Default to an empty tuple.
|
||||||
|
|
||||||
|
kwargs (additional keyword arguments, *optional*):
|
||||||
|
Will be passed to the underlying model specific encode method.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`List[bytes|str]`: The list of tokens.
|
||||||
|
"""
|
||||||
|
tokens = []
|
||||||
|
text = unicodedata.normalize("NFC", text)
|
||||||
|
|
||||||
|
# this implementation takes a detour: text -> token id -> token surface forms
|
||||||
|
for t in self.tokenizer.encode(
|
||||||
|
text, allowed_special=allowed_special, disallowed_special=disallowed_special
|
||||||
|
):
|
||||||
|
tokens.append(self.decoder[t])
|
||||||
|
return tokens
|
||||||
|
|
||||||
|
def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
|
||||||
|
"""
|
||||||
|
Converts a sequence of tokens into a single string.
|
||||||
|
"""
|
||||||
|
text = ""
|
||||||
|
temp = b""
|
||||||
|
for t in tokens:
|
||||||
|
if isinstance(t, str):
|
||||||
|
if temp:
|
||||||
|
text += temp.decode("utf-8", errors=self.errors)
|
||||||
|
temp = b""
|
||||||
|
text += t
|
||||||
|
elif isinstance(t, bytes):
|
||||||
|
temp += t
|
||||||
|
else:
|
||||||
|
raise TypeError("token should only be of type bytes or str")
|
||||||
|
if temp:
|
||||||
|
text += temp.decode("utf-8", errors=self.errors)
|
||||||
|
return text
|
||||||
|
|
||||||
|
@property
|
||||||
|
def vocab_size(self):
|
||||||
|
return self.tokenizer.n_vocab
|
||||||
|
|
||||||
|
def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
|
||||||
|
"""Converts an id to a token, special tokens included"""
|
||||||
|
if index in self.decoder:
|
||||||
|
return self.decoder[index]
|
||||||
|
raise ValueError("unknown ids")
|
||||||
|
|
||||||
|
def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
|
||||||
|
"""Converts a token to an id using the vocab, special tokens included"""
|
||||||
|
if token in self.special_tokens:
|
||||||
|
return self.special_tokens[token]
|
||||||
|
if token in self.mergeable_ranks:
|
||||||
|
return self.mergeable_ranks[token]
|
||||||
|
raise ValueError("unknown token")
|
||||||
|
|
||||||
|
def _tokenize(self, text: str, **kwargs):
|
||||||
|
"""
|
||||||
|
Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
|
||||||
|
vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
|
||||||
|
|
||||||
|
Do NOT take care of added tokens.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def _decode(
|
||||||
|
self,
|
||||||
|
token_ids: Union[int, List[int]],
|
||||||
|
skip_special_tokens: bool = False,
|
||||||
|
errors: str = None,
|
||||||
|
**kwargs,
|
||||||
|
) -> str:
|
||||||
|
if isinstance(token_ids, int):
|
||||||
|
token_ids = [token_ids]
|
||||||
|
if skip_special_tokens:
|
||||||
|
token_ids = [i for i in token_ids if i < self.eod_id]
|
||||||
|
return self.tokenizer.decode(token_ids, errors=errors or self.errors)
|
|
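The QWenTokenizer class above is wired up through tokenizer_config.json's auto_map, so it is normally loaded via AutoTokenizer with trust_remote_code. A minimal usage sketch (the model path is the one used in this run):

```python
from transformers import AutoTokenizer

# trust_remote_code makes transformers import the QWenTokenizer defined above
tokenizer = AutoTokenizer.from_pretrained("../../../models/qwen", trust_remote_code=True)

ids = tokenizer.encode("hello world")  # ordinary text goes through the tiktoken BPE ranks
print(tokenizer.decode(ids))

# special tokens such as <|im_end|> resolve through special_tokens, not the BPE table
print(tokenizer.convert_tokens_to_ids("<|im_end|>"))  # 151645 in this vocab
```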
@@ -0,0 +1,17 @@
|
||||||
|
{
|
||||||
|
"added_tokens_decoder": {},
|
||||||
|
"auto_map": {
|
||||||
|
"AutoTokenizer": [
|
||||||
|
"tokenization_qwen.QWenTokenizer",
|
||||||
|
null
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
|
||||||
|
"clean_up_tokenization_spaces": true,
|
||||||
|
"eos_token": "<|im_end|>",
|
||||||
|
"model_max_length": 32768,
|
||||||
|
"pad_token": "<|im_end|>",
|
||||||
|
"padding_side": "right",
|
||||||
|
"split_special_tokens": false,
|
||||||
|
"tokenizer_class": "QWenTokenizer"
|
||||||
|
}
|
|
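The chat_template above is what produces the `<|im_start|>…<|im_end|>` framing seen in the training example logged later. A minimal sketch of rendering a prompt with it (the path is assumed to be a checkpoint directory containing this tokenizer_config.json):

```python
from transformers import AutoTokenizer

# Assumed path: a saved run directory that contains the tokenizer_config.json above.
tokenizer = AutoTokenizer.from_pretrained(
    "./results/lora_sft_Qwen-7B_8_gpu_50_step_20240918153046", trust_remote_code=True
)

messages = [{"role": "user", "content": "Hello!"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print(prompt)
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant
```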
@@ -0,0 +1,161 @@
|
||||||
|
{
|
||||||
|
"best_metric": null,
|
||||||
|
"best_model_checkpoint": null,
|
||||||
|
"epoch": 0.08888888888888889,
|
||||||
|
"eval_steps": 500,
|
||||||
|
"global_step": 50,
|
||||||
|
"is_hyper_param_search": false,
|
||||||
|
"is_local_process_zero": true,
|
||||||
|
"is_world_process_zero": true,
|
||||||
|
"log_history": [
|
||||||
|
{
|
||||||
|
"epoch": 0.005333333333333333,
|
||||||
|
"grad_norm": 0.8999722599983215,
|
||||||
|
"learning_rate": 4e-05,
|
||||||
|
"loss": 1.5189,
|
||||||
|
"num_input_tokens_seen": 9808,
|
||||||
|
"step": 3
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.010666666666666666,
|
||||||
|
"grad_norm": NaN,
|
||||||
|
"learning_rate": 6e-05,
|
||||||
|
"loss": 1.5504,
|
||||||
|
"num_input_tokens_seen": 19312,
|
||||||
|
"step": 6
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.016,
|
||||||
|
"grad_norm": 0.9268227219581604,
|
||||||
|
"learning_rate": 9.987820251299122e-05,
|
||||||
|
"loss": 1.5661,
|
||||||
|
"num_input_tokens_seen": 29232,
|
||||||
|
"step": 9
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.021333333333333333,
|
||||||
|
"grad_norm": 1.1588999032974243,
|
||||||
|
"learning_rate": 9.806308479691595e-05,
|
||||||
|
"loss": 1.7033,
|
||||||
|
"num_input_tokens_seen": 37984,
|
||||||
|
"step": 12
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.02666666666666667,
|
||||||
|
"grad_norm": 1.0571134090423584,
|
||||||
|
"learning_rate": 9.567727288213005e-05,
|
||||||
|
"loss": 1.4225,
|
||||||
|
"num_input_tokens_seen": 44592,
|
||||||
|
"step": 15
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.032,
|
||||||
|
"grad_norm": 1.720107913017273,
|
||||||
|
"learning_rate": 9.24024048078213e-05,
|
||||||
|
"loss": 1.4217,
|
||||||
|
"num_input_tokens_seen": 52400,
|
||||||
|
"step": 18
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.037333333333333336,
|
||||||
|
"grad_norm": 0.930574893951416,
|
||||||
|
"learning_rate": 8.596699001693255e-05,
|
||||||
|
"loss": 1.2793,
|
||||||
|
"num_input_tokens_seen": 60320,
|
||||||
|
"step": 21
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.042666666666666665,
|
||||||
|
"grad_norm": 1.6979925632476807,
|
||||||
|
"learning_rate": 7.795964517353735e-05,
|
||||||
|
"loss": 1.4875,
|
||||||
|
"num_input_tokens_seen": 67024,
|
||||||
|
"step": 24
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.048,
|
||||||
|
"grad_norm": 2.2298834323883057,
|
||||||
|
"learning_rate": 6.873032967079561e-05,
|
||||||
|
"loss": 1.2446,
|
||||||
|
"num_input_tokens_seen": 73776,
|
||||||
|
"step": 27
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.05333333333333334,
|
||||||
|
"grad_norm": 1.6609553098678589,
|
||||||
|
"learning_rate": 5.868240888334653e-05,
|
||||||
|
"loss": 1.4691,
|
||||||
|
"num_input_tokens_seen": 82592,
|
||||||
|
"step": 30
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.058666666666666666,
|
||||||
|
"grad_norm": 1.1659108400344849,
|
||||||
|
"learning_rate": 4.825502516487497e-05,
|
||||||
|
"loss": 1.4451,
|
||||||
|
"num_input_tokens_seen": 90512,
|
||||||
|
"step": 33
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.064,
|
||||||
|
"grad_norm": 1.2636826038360596,
|
||||||
|
"learning_rate": 3.790390522001662e-05,
|
||||||
|
"loss": 1.4139,
|
||||||
|
"num_input_tokens_seen": 96848,
|
||||||
|
"step": 36
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.06933333333333333,
|
||||||
|
"grad_norm": 3.8678996562957764,
|
||||||
|
"learning_rate": 2.8081442660546125e-05,
|
||||||
|
"loss": 1.3205,
|
||||||
|
"num_input_tokens_seen": 103728,
|
||||||
|
"step": 39
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.07466666666666667,
|
||||||
|
"grad_norm": 0.6766985654830933,
|
||||||
|
"learning_rate": 1.9216926233717085e-05,
|
||||||
|
"loss": 1.2969,
|
||||||
|
"num_input_tokens_seen": 112160,
|
||||||
|
"step": 42
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.08,
|
||||||
|
"grad_norm": 0.7236246466636658,
|
||||||
|
"learning_rate": 1.1697777844051105e-05,
|
||||||
|
"loss": 1.5026,
|
||||||
|
"num_input_tokens_seen": 117984,
|
||||||
|
"step": 45
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"epoch": 0.08533333333333333,
|
||||||
|
"grad_norm": 0.8583828806877136,
|
||||||
|
"learning_rate": 5.852620357053651e-06,
|
||||||
|
"loss": 1.3583,
|
||||||
|
"num_input_tokens_seen": 126624,
|
||||||
|
"step": 48
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"logging_steps": 3,
|
||||||
|
"max_steps": 50,
|
||||||
|
"num_input_tokens_seen": 132048,
|
||||||
|
"num_train_epochs": 1,
|
||||||
|
"save_steps": 500,
|
||||||
|
"stateful_callbacks": {
|
||||||
|
"TrainerControl": {
|
||||||
|
"args": {
|
||||||
|
"should_epoch_stop": false,
|
||||||
|
"should_evaluate": false,
|
||||||
|
"should_log": false,
|
||||||
|
"should_save": true,
|
||||||
|
"should_training_stop": true
|
||||||
|
},
|
||||||
|
"attributes": {}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"total_flos": 5638623387844608.0,
|
||||||
|
"train_batch_size": 2,
|
||||||
|
"trial_name": null,
|
||||||
|
"trial_params": null
|
||||||
|
}
|
Binary file not shown.
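The trainer_state.json above keeps the per-logging-step history under log_history, which is what plot_loss: true ultimately draws from. A minimal sketch for re-plotting the loss curve from it (the path is assumed; the file lives inside the run's output_dir):

```python
import json
import matplotlib.pyplot as plt

# Assumed location: the trainer_state.json shown above, inside the run's output_dir.
with open("trainer_state.json") as f:
    state = json.load(f)

history = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in history]
losses = [e["loss"] for e in history]

plt.plot(steps, losses, marker="o")
plt.xlabel("step")
plt.ylabel("training loss")
plt.savefig("training_loss.png")
```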
@@ -0,0 +1,8 @@
{
  "epoch": 0.08888888888888889,
  "eval_loss": 1.3352934122085571,
  "eval_runtime": 55.0087,
  "eval_samples_per_second": 18.179,
  "eval_steps_per_second": 9.089,
  "num_input_tokens_seen": 132048
}
@@ -0,0 +1,9 @@
|
||||||
|
{"cur_time": "2024-09-18 15:30:50", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
||||||
|
{"cur_time": "2024-09-18 15:31:52", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
||||||
|
{"cur_time": "2024-09-18 15:32:54", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
||||||
|
{"cur_time": "2024-09-18 15:33:57", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
||||||
|
{"cur_time": "2024-09-18 15:34:59", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
||||||
|
{"cur_time": "2024-09-18 15:36:01", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
||||||
|
{"cur_time": "2024-09-18 15:37:04", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
||||||
|
{"cur_time": "2024-09-18 15:38:06", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
||||||
|
{"cur_time": "2024-09-18 15:39:08", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
|
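The NPU monitor output above is JSON-lines, one sample per minute, though the power_dissipation values are empty in this capture and the lines are therefore not valid JSON. A tolerant parsing sketch (the filename is hypothetical):

```python
import json

samples = []
with open("npu_status.jsonl") as f:  # hypothetical filename for the log above
    for line in f:
        try:
            samples.append(json.loads(line))
        except json.JSONDecodeError:
            # lines with an empty "power_dissipation": field are not valid JSON; skip them
            continue

for s in samples:
    print(s["cur_time"], [d["npu_id"] for d in s["npu_power_dissipation"]])
```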
@@ -0,0 +1,75 @@
|
||||||
|
[2024-09-18 15:31:02,784] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to npu (auto detect)
|
||||||
|
[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
|
||||||
|
[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
|
||||||
|
09/18/2024 15:31:08 - INFO - llamafactory.hparams.parser - Process rank: 0, device: npu:0, n_gpu: 1, distributed training: False, compute dtype: torch.float16
|
||||||
|
09/18/2024 15:31:09 - INFO - llamafactory.data.template - Add eos token: <|im_end|>
|
||||||
|
09/18/2024 15:31:09 - INFO - llamafactory.data.template - Add pad token: <|im_end|>
|
||||||
|
09/18/2024 15:31:09 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_1M_CN...
|
||||||
|
training example:
|
||||||
|
input_ids:
|
||||||
|
[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 104317, 89012, 22382, 106096, 64471, 101137, 72881, 102648, 46448, 1773, 62244, 107132, 37945, 99553, 25177, 101898, 8997, 100431, 99639, 113773, 9370, 111749, 25, 330, 100012, 105435, 99487, 100220, 3837, 104817, 44063, 99553, 102322, 20074, 33108, 116993, 3837, 23031, 104022, 100147, 101313, 1773, 698, 151645, 198, 151644, 77091, 198, 99487, 111749, 101137, 72881, 102648, 46448, 1773, 151645]
|
||||||
|
inputs:
|
||||||
|
<|im_start|>system
|
||||||
|
You are a helpful assistant.<|im_end|>
|
||||||
|
<|im_start|>user
|
||||||
|
判断给定的文章是否符合语法规则。如果不符合,请提供修改建议。
|
||||||
|
下面是一篇文章的开头: "为了探讨这个主题,本文将提供一系列数据和实例,以证明这一观点。"
|
||||||
|
<|im_end|>
|
||||||
|
<|im_start|>assistant
|
||||||
|
这个开头符合语法规则。<|im_end|>
|
||||||
|
label_ids:
|
||||||
|
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 99487, 111749, 101137, 72881, 102648, 46448, 1773, 151645]
|
||||||
|
labels:
|
||||||
|
这个开头符合语法规则。<|im_end|>
|
||||||
|
09/18/2024 15:35:27 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
|
||||||
|
09/18/2024 15:35:27 - INFO - llamafactory.model.model_utils.attention - Using vanilla attention implementation.
|
||||||
|
09/18/2024 15:35:27 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
|
||||||
|
09/18/2024 15:35:27 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA
|
||||||
|
09/18/2024 15:35:27 - INFO - llamafactory.model.model_utils.misc - Found linear modules: w2,w1,c_proj,c_attn
|
||||||
|
09/18/2024 15:35:28 - INFO - llamafactory.model.loader - trainable params: 17,891,328 || all params: 7,739,215,872 || trainable%: 0.2312
|
||||||
|
Gradient overflow. Skipping step
|
||||||
|
Loss scaler reducing loss scale to 32768.0
|
||||||
|
{'loss': 1.5189, 'grad_norm': 0.8999722599983215, 'learning_rate': 4e-05, 'epoch': 0.01, 'num_input_tokens_seen': 9808}
|
||||||
|
Gradient overflow. Skipping step
|
||||||
|
Loss scaler reducing loss scale to 16384.0
|
||||||
|
Gradient overflow. Skipping step
|
||||||
|
Loss scaler reducing loss scale to 8192.0
|
||||||
|
{'loss': 1.5504, 'grad_norm': nan, 'learning_rate': 6e-05, 'epoch': 0.01, 'num_input_tokens_seen': 19312}
|
||||||
|
{'loss': 1.5661, 'grad_norm': 0.9268227219581604, 'learning_rate': 9.987820251299122e-05, 'epoch': 0.02, 'num_input_tokens_seen': 29232}
|
||||||
|
{'loss': 1.7033, 'grad_norm': 1.1588999032974243, 'learning_rate': 9.806308479691595e-05, 'epoch': 0.02, 'num_input_tokens_seen': 37984}
|
||||||
|
Gradient overflow. Skipping step
|
||||||
|
Loss scaler reducing loss scale to 4096.0
|
||||||
|
{'loss': 1.4225, 'grad_norm': 1.0571134090423584, 'learning_rate': 9.567727288213005e-05, 'epoch': 0.03, 'num_input_tokens_seen': 44592}
|
||||||
|
Gradient overflow. Skipping step
|
||||||
|
Loss scaler reducing loss scale to 2048.0
|
||||||
|
{'loss': 1.4217, 'grad_norm': 1.720107913017273, 'learning_rate': 9.24024048078213e-05, 'epoch': 0.03, 'num_input_tokens_seen': 52400}
|
||||||
|
{'loss': 1.2793, 'grad_norm': 0.930574893951416, 'learning_rate': 8.596699001693255e-05, 'epoch': 0.04, 'num_input_tokens_seen': 60320}
|
||||||
|
{'loss': 1.4875, 'grad_norm': 1.6979925632476807, 'learning_rate': 7.795964517353735e-05, 'epoch': 0.04, 'num_input_tokens_seen': 67024}
|
||||||
|
{'loss': 1.2446, 'grad_norm': 2.2298834323883057, 'learning_rate': 6.873032967079561e-05, 'epoch': 0.05, 'num_input_tokens_seen': 73776}
|
||||||
|
{'loss': 1.4691, 'grad_norm': 1.6609553098678589, 'learning_rate': 5.868240888334653e-05, 'epoch': 0.05, 'num_input_tokens_seen': 82592}
|
||||||
|
{'loss': 1.4451, 'grad_norm': 1.1659108400344849, 'learning_rate': 4.825502516487497e-05, 'epoch': 0.06, 'num_input_tokens_seen': 90512}
|
||||||
|
{'loss': 1.4139, 'grad_norm': 1.2636826038360596, 'learning_rate': 3.790390522001662e-05, 'epoch': 0.06, 'num_input_tokens_seen': 96848}
|
||||||
|
{'loss': 1.3205, 'grad_norm': 3.8678996562957764, 'learning_rate': 2.8081442660546125e-05, 'epoch': 0.07, 'num_input_tokens_seen': 103728}
|
||||||
|
{'loss': 1.2969, 'grad_norm': 0.6766985654830933, 'learning_rate': 1.9216926233717085e-05, 'epoch': 0.07, 'num_input_tokens_seen': 112160}
|
||||||
|
{'loss': 1.5026, 'grad_norm': 0.7236246466636658, 'learning_rate': 1.1697777844051105e-05, 'epoch': 0.08, 'num_input_tokens_seen': 117984}
|
||||||
|
{'loss': 1.3583, 'grad_norm': 0.8583828806877136, 'learning_rate': 5.852620357053651e-06, 'epoch': 0.09, 'num_input_tokens_seen': 126624}
|
||||||
|
{'train_runtime': 174.0133, 'train_samples_per_second': 4.597, 'train_steps_per_second': 0.287, 'train_tokens_per_second': 1140.143, 'train_loss': 1.4425424909591675, 'epoch': 0.09, 'num_input_tokens_seen': 132048}
***** train metrics *****
  epoch                    =     0.0889
  num_input_tokens_seen    =     132048
  total_flos               =  5251377GF
  train_loss               =     1.4425
  train_runtime            = 0:02:54.01
  train_samples_per_second =      4.597
  train_steps_per_second   =      0.287
  train_tokens_per_second  =   1140.143
Figure saved at: ./results/lora_sft_Qwen-7B_8_gpu_50_step_20240918153046/training_loss.png
09/18/2024 15:38:23 - WARNING - llamafactory.extras.ploting - No metric eval_loss to plot.
09/18/2024 15:38:23 - WARNING - llamafactory.extras.ploting - No metric eval_accuracy to plot.
***** eval metrics *****
  epoch                   =     0.0889
  eval_loss               =     1.3353
  eval_runtime            = 0:00:55.00
  eval_samples_per_second =     18.179
  eval_steps_per_second   =      9.089
  num_input_tokens_seen   =     132048
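A quick arithmetic check on the train metrics above (assuming the effective batch size of 16 derived from the config earlier): the steps-per-second and samples-per-second figures follow directly from the 174 s runtime and 50 steps.

```python
train_runtime = 174.0133   # seconds, from the train metrics above
max_steps = 50
effective_batch_size = 16  # 2 per device * 8 accumulation steps * 1 NPU

steps_per_second = max_steps / train_runtime
samples_per_second = steps_per_second * effective_batch_size

print(round(steps_per_second, 3))    # 0.287, as logged
print(round(samples_per_second, 3))  # 4.597, as logged
```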
@@ -0,0 +1,31 @@
cutoff_len: 1024
dataset: belle_1m
ddp_timeout: 180000000
do_train: true
eval_steps: 500
eval_strategy: steps
finetuning_type: lora
fp16: true
gradient_accumulation_steps: 8
include_num_input_tokens_seen: true
include_tokens_per_second: true
learning_rate: 0.0001
logging_steps: 3
lora_target: all
lr_scheduler_type: cosine
max_samples: 10000
max_steps: 50
model_name_or_path: ../../../models/qwen
num_train_epochs: 10.0
output_dir: ./results/lora_sft_Qwen-7B_8_gpu_50_step_20240918153046
overwrite_cache: true
overwrite_output_dir: true
per_device_eval_batch_size: 2
per_device_train_batch_size: 2
plot_loss: true
preprocessing_num_workers: 16
save_steps: 500
stage: sft
template: qwen
val_size: 0.1
warmup_ratio: 0.1
File diff suppressed because it is too large
@@ -0,0 +1,10 @@
{
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<|im_end|>"
}
@@ -0,0 +1,276 @@
|
||||||
|
# Copyright (c) Alibaba Cloud.
|
||||||
|
#
|
||||||
|
# This source code is licensed under the license found in the
|
||||||
|
# LICENSE file in the root directory of this source tree.
|
||||||
|
|
||||||
|
"""Tokenization classes for QWen."""
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import unicodedata
|
||||||
|
from typing import Collection, Dict, List, Set, Tuple, Union
|
||||||
|
|
||||||
|
import tiktoken
|
||||||
|
from transformers import PreTrainedTokenizer, AddedToken
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
|
||||||
|
|
||||||
|
PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
|
||||||
|
ENDOFTEXT = "<|endoftext|>"
|
||||||
|
IMSTART = "<|im_start|>"
|
||||||
|
IMEND = "<|im_end|>"
|
||||||
|
# as the default behavior is changed to allow special tokens in
|
||||||
|
# regular texts, the surface forms of special tokens need to be
|
||||||
|
# as different as possible to minimize the impact
|
||||||
|
EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
|
||||||
|
# changed to use actual index to avoid misconfiguration with vocabulary expansion
|
||||||
|
SPECIAL_START_ID = 151643
|
||||||
|
SPECIAL_TOKENS = tuple(
|
||||||
|
enumerate(
|
||||||
|
(
|
||||||
|
(
|
||||||
|
ENDOFTEXT,
|
||||||
|
IMSTART,
|
||||||
|
IMEND,
|
||||||
|
)
|
||||||
|
+ EXTRAS
|
||||||
|
),
|
||||||
|
start=SPECIAL_START_ID,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
|
||||||
|
with open(tiktoken_bpe_file, "rb") as f:
|
||||||
|
contents = f.read()
|
||||||
|
return {
|
||||||
|
base64.b64decode(token): int(rank)
|
||||||
|
for token, rank in (line.split() for line in contents.splitlines() if line)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class QWenTokenizer(PreTrainedTokenizer):
|
||||||
|
"""QWen tokenizer."""
|
||||||
|
|
||||||
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
vocab_file,
|
||||||
|
errors="replace",
|
||||||
|
extra_vocab_file=None,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
|
# how to handle errors in decoding UTF-8 byte sequences
|
||||||
|
# use ignore if you are in streaming inference
|
||||||
|
self.errors = errors
|
||||||
|
|
||||||
|
self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: Dict[bytes, int]
|
||||||
|
self.special_tokens = {
|
||||||
|
token: index
|
||||||
|
for index, token in SPECIAL_TOKENS
|
||||||
|
}
|
||||||
|
|
||||||
|
# try to load extra vocab from file
|
||||||
|
if extra_vocab_file is not None:
|
||||||
|
used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
|
||||||
|
extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
|
||||||
|
for token, index in extra_mergeable_ranks.items():
|
||||||
|
if token in self.mergeable_ranks:
|
||||||
|
logger.info(f"extra token {token} exists, skipping")
|
||||||
|
continue
|
||||||
|
if index in used_ids:
|
||||||
|
logger.info(f'the index {index} for extra token {token} exists, skipping')
|
||||||
|
continue
|
||||||
|
self.mergeable_ranks[token] = index
|
||||||
|
# the index may be sparse after this, but don't worry tiktoken.Encoding will handle this
|
||||||
|
|
||||||
|
enc = tiktoken.Encoding(
|
||||||
|
"Qwen",
|
||||||
|
pat_str=PAT_STR,
|
||||||
|
mergeable_ranks=self.mergeable_ranks,
|
||||||
|
special_tokens=self.special_tokens,
|
||||||
|
)
|
||||||
|
assert (
|
||||||
|
len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
|
||||||
|
), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
|
||||||
|
|
||||||
|
self.decoder = {
|
||||||
|
v: k for k, v in self.mergeable_ranks.items()
|
||||||
|
} # type: dict[int, bytes|str]
|
||||||
|
self.decoder.update({v: k for k, v in self.special_tokens.items()})
|
||||||
|
|
||||||
|
self.tokenizer = enc # type: tiktoken.Encoding
|
||||||
|
|
||||||
|
self.eod_id = self.tokenizer.eot_token
|
||||||
|
self.im_start_id = self.special_tokens[IMSTART]
|
||||||
|
self.im_end_id = self.special_tokens[IMEND]
|
||||||
|
|
||||||
|
def __getstate__(self):
|
||||||
|
# for pickle lovers
|
||||||
|
state = self.__dict__.copy()
|
||||||
|
del state["tokenizer"]
|
||||||
|
return state
|
||||||
|
|
||||||
|
def __setstate__(self, state):
|
||||||
|
# tokenizer is not python native; don't pass it; rebuild it
|
||||||
|
self.__dict__.update(state)
|
||||||
|
enc = tiktoken.Encoding(
|
||||||
|
"Qwen",
|
||||||
|
pat_str=PAT_STR,
|
||||||
|
mergeable_ranks=self.mergeable_ranks,
|
||||||
|
special_tokens=self.special_tokens,
|
||||||
|
)
|
||||||
|
self.tokenizer = enc
|
||||||
|
|
||||||
|
def __len__(self) -> int:
|
||||||
|
return self.tokenizer.n_vocab
|
||||||
|
|
||||||
|
def get_vocab(self) -> Dict[bytes, int]:
|
||||||
|
return self.mergeable_ranks
|
||||||
|
|
||||||
|
def convert_tokens_to_ids(
|
||||||
|
self, tokens: Union[bytes, str, List[Union[bytes, str]]]
|
||||||
|
) -> List[int]:
|
||||||
|
ids = []
|
||||||
|
if isinstance(tokens, (str, bytes)):
|
||||||
|
if tokens in self.special_tokens:
|
||||||
|
return self.special_tokens[tokens]
|
||||||
|
else:
|
||||||
|
return self.mergeable_ranks.get(tokens)
|
||||||
|
for token in tokens:
|
||||||
|
if token in self.special_tokens:
|
||||||
|
ids.append(self.special_tokens[token])
|
||||||
|
else:
|
||||||
|
ids.append(self.mergeable_ranks.get(token))
|
||||||
|
return ids
|
||||||
|
|
||||||
|
def _add_tokens(
|
||||||
|
self,
|
||||||
|
new_tokens: Union[List[str], List[AddedToken]],
|
||||||
|
special_tokens: bool = False,
|
||||||
|
) -> int:
|
||||||
|
if not special_tokens and new_tokens:
|
||||||
|
raise ValueError("Adding regular tokens is not supported")
|
||||||
|
for token in new_tokens:
|
||||||
|
surface_form = token.content if isinstance(token, AddedToken) else token
|
||||||
|
if surface_form not in SPECIAL_TOKENS_SET:
|
||||||
|
raise ValueError("Adding unknown special tokens is not supported")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
|
||||||
|
"""
|
||||||
|
Save only the vocabulary of the tokenizer (vocabulary).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`Tuple(str)`: Paths to the files saved.
|
||||||
|
"""
|
||||||
|
file_path = os.path.join(save_directory, "qwen.tiktoken")
|
||||||
|
with open(file_path, "w", encoding="utf8") as w:
|
||||||
|
for k, v in self.mergeable_ranks.items():
|
||||||
|
line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
|
||||||
|
w.write(line)
|
||||||
|
return (file_path,)
|
||||||
|
|
||||||
|
def tokenize(
|
||||||
|
self,
|
||||||
|
text: str,
|
||||||
|
allowed_special: Union[Set, str] = "all",
|
||||||
|
disallowed_special: Union[Collection, str] = (),
|
||||||
|
**kwargs,
|
||||||
|
) -> List[Union[bytes, str]]:
|
||||||
|
"""
|
||||||
|
Converts a string into a sequence of tokens.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text (`str`):
|
||||||
|
The sequence to be encoded.
|
||||||
|
allowed_special (`Literal["all"]` or `set`):
|
||||||
|
The surface forms of the tokens to be encoded as special tokens in regular texts.
|
||||||
|
Default to "all".
|
||||||
|
disallowed_special (`Literal["all"]` or `Collection`):
|
||||||
|
The surface forms of the tokens that should not be in regular texts and trigger errors.
|
||||||
|
Default to an empty tuple.
|
||||||
|
|
||||||
|
kwargs (additional keyword arguments, *optional*):
|
||||||
|
Will be passed to the underlying model specific encode method.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
`List[bytes|str]`: The list of tokens.
|
||||||
|
"""
|
||||||
|
tokens = []
|
||||||
|
text = unicodedata.normalize("NFC", text)
|
||||||
|
|
||||||
|
# this implementation takes a detour: text -> token id -> token surface forms
|
||||||
|
for t in self.tokenizer.encode(
|
||||||
|
text, allowed_special=allowed_special, disallowed_special=disallowed_special
|
||||||
|
):
|
||||||
|
tokens.append(self.decoder[t])
|
||||||
|
return tokens
|
||||||
|
|
||||||
|
def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
|
||||||
|
"""
|
||||||
|
Converts a sequence of tokens into a single string.
|
||||||
|
"""
|
||||||
|
text = ""
|
||||||
|
temp = b""
|
||||||
|
for t in tokens:
|
||||||
|
if isinstance(t, str):
|
||||||
|
if temp:
|
||||||
|
text += temp.decode("utf-8", errors=self.errors)
|
||||||
|
temp = b""
|
||||||
|
text += t
|
||||||
|
elif isinstance(t, bytes):
|
||||||
|
temp += t
|
||||||
|
else:
|
||||||
|
raise TypeError("token should only be of type bytes or str")
|
||||||
|
if temp:
|
||||||
|
text += temp.decode("utf-8", errors=self.errors)
|
||||||
|
return text
|
||||||
|
|
||||||
|
@property
|
||||||
|
def vocab_size(self):
|
||||||
|
return self.tokenizer.n_vocab
|
||||||
|
|
||||||
|
def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
|
||||||
|
"""Converts an id to a token, special tokens included"""
|
||||||
|
if index in self.decoder:
|
||||||
|
return self.decoder[index]
|
||||||
|
raise ValueError("unknown ids")
|
||||||
|
|
||||||
|
def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
|
||||||
|
"""Converts a token to an id using the vocab, special tokens included"""
|
||||||
|
if token in self.special_tokens:
|
||||||
|
return self.special_tokens[token]
|
||||||
|
if token in self.mergeable_ranks:
|
||||||
|
return self.mergeable_ranks[token]
|
||||||
|
raise ValueError("unknown token")
|
||||||
|
|
||||||
|
def _tokenize(self, text: str, **kwargs):
|
||||||
|
"""
|
||||||
|
Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
|
||||||
|
vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
|
||||||
|
|
||||||
|
Do NOT take care of added tokens.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def _decode(
|
||||||
|
self,
|
||||||
|
token_ids: Union[int, List[int]],
|
||||||
|
skip_special_tokens: bool = False,
|
||||||
|
errors: str = None,
|
||||||
|
**kwargs,
|
||||||
|
) -> str:
|
||||||
|
if isinstance(token_ids, int):
|
||||||
|
token_ids = [token_ids]
|
||||||
|
if skip_special_tokens:
|
||||||
|
token_ids = [i for i in token_ids if i < self.eod_id]
|
||||||
|
return self.tokenizer.decode(token_ids, errors=errors or self.errors)
|
|
@@ -0,0 +1,17 @@
|
||||||
|
{
|
||||||
|
"added_tokens_decoder": {},
|
||||||
|
"auto_map": {
|
||||||
|
"AutoTokenizer": [
|
||||||
|
"tokenization_qwen.QWenTokenizer",
|
||||||
|
null
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
|
||||||
|
"clean_up_tokenization_spaces": true,
|
||||||
|
"eos_token": "<|im_end|>",
|
||||||
|
"model_max_length": 32768,
|
||||||
|
"pad_token": "<|im_end|>",
|
||||||
|
"padding_side": "right",
|
||||||
|
"split_special_tokens": false,
|
||||||
|
"tokenizer_class": "QWenTokenizer"
|
||||||
|
}
|
|
@@ -0,0 +1,10 @@
{
  "epoch": 0.08888888888888889,
  "num_input_tokens_seen": 132048,
  "total_flos": 5638623387844608.0,
  "train_loss": 1.4425424909591675,
  "train_runtime": 174.0133,
  "train_samples_per_second": 4.597,
  "train_steps_per_second": 0.287,
  "train_tokens_per_second": 1140.143
}
@@ -0,0 +1,17 @@
{"current_steps": 3, "total_steps": 50, "loss": 1.5189, "learning_rate": 4e-05, "epoch": 0.005333333333333333, "percentage": 6.0, "elapsed_time": "0:00:12", "remaining_time": "0:03:13", "throughput": 793.94, "total_tokens": 9808}
{"current_steps": 6, "total_steps": 50, "loss": 1.5504, "learning_rate": 6e-05, "epoch": 0.010666666666666666, "percentage": 12.0, "elapsed_time": "0:00:22", "remaining_time": "0:02:45", "throughput": 856.96, "total_tokens": 19312}
{"current_steps": 9, "total_steps": 50, "loss": 1.5661, "learning_rate": 9.987820251299122e-05, "epoch": 0.016, "percentage": 18.0, "elapsed_time": "0:00:33", "remaining_time": "0:02:31", "throughput": 880.54, "total_tokens": 29232}
{"current_steps": 12, "total_steps": 50, "loss": 1.7033, "learning_rate": 9.806308479691595e-05, "epoch": 0.021333333333333333, "percentage": 24.0, "elapsed_time": "0:00:43", "remaining_time": "0:02:18", "throughput": 870.04, "total_tokens": 37984}
{"current_steps": 15, "total_steps": 50, "loss": 1.4225, "learning_rate": 9.567727288213005e-05, "epoch": 0.02666666666666667, "percentage": 30.0, "elapsed_time": "0:00:53", "remaining_time": "0:02:05", "throughput": 826.38, "total_tokens": 44592}
{"current_steps": 18, "total_steps": 50, "loss": 1.4217, "learning_rate": 9.24024048078213e-05, "epoch": 0.032, "percentage": 36.0, "elapsed_time": "0:01:04", "remaining_time": "0:01:54", "throughput": 815.59, "total_tokens": 52400}
{"current_steps": 21, "total_steps": 50, "loss": 1.2793, "learning_rate": 8.596699001693255e-05, "epoch": 0.037333333333333336, "percentage": 42.0, "elapsed_time": "0:01:14", "remaining_time": "0:01:42", "throughput": 808.87, "total_tokens": 60320}
{"current_steps": 24, "total_steps": 50, "loss": 1.4875, "learning_rate": 7.795964517353735e-05, "epoch": 0.042666666666666665, "percentage": 48.0, "elapsed_time": "0:01:25", "remaining_time": "0:01:32", "throughput": 786.17, "total_tokens": 67024}
{"current_steps": 27, "total_steps": 50, "loss": 1.2446, "learning_rate": 6.873032967079561e-05, "epoch": 0.048, "percentage": 54.0, "elapsed_time": "0:01:35", "remaining_time": "0:01:21", "throughput": 769.07, "total_tokens": 73776}
{"current_steps": 30, "total_steps": 50, "loss": 1.4691, "learning_rate": 5.868240888334653e-05, "epoch": 0.05333333333333334, "percentage": 60.0, "elapsed_time": "0:01:46", "remaining_time": "0:01:11", "throughput": 774.71, "total_tokens": 82592}
{"current_steps": 33, "total_steps": 50, "loss": 1.4451, "learning_rate": 4.825502516487497e-05, "epoch": 0.058666666666666666, "percentage": 66.0, "elapsed_time": "0:01:56", "remaining_time": "0:01:00", "throughput": 774.14, "total_tokens": 90512}
{"current_steps": 36, "total_steps": 50, "loss": 1.4139, "learning_rate": 3.790390522001662e-05, "epoch": 0.064, "percentage": 72.0, "elapsed_time": "0:02:06", "remaining_time": "0:00:49", "throughput": 763.48, "total_tokens": 96848}
{"current_steps": 39, "total_steps": 50, "loss": 1.3205, "learning_rate": 2.8081442660546125e-05, "epoch": 0.06933333333333333, "percentage": 78.0, "elapsed_time": "0:02:16", "remaining_time": "0:00:38", "throughput": 758.29, "total_tokens": 103728}
{"current_steps": 42, "total_steps": 50, "loss": 1.2969, "learning_rate": 1.9216926233717085e-05, "epoch": 0.07466666666666667, "percentage": 84.0, "elapsed_time": "0:02:26", "remaining_time": "0:00:27", "throughput": 763.76, "total_tokens": 112160}
{"current_steps": 45, "total_steps": 50, "loss": 1.5026, "learning_rate": 1.1697777844051105e-05, "epoch": 0.08, "percentage": 90.0, "elapsed_time": "0:02:36", "remaining_time": "0:00:17", "throughput": 753.01, "total_tokens": 117984}
{"current_steps": 48, "total_steps": 50, "loss": 1.3583, "learning_rate": 5.852620357053651e-06, "epoch": 0.08533333333333333, "percentage": 96.0, "elapsed_time": "0:02:46", "remaining_time": "0:00:06", "throughput": 760.49, "total_tokens": 126624}
{"current_steps": 50, "total_steps": 50, "epoch": 0.08888888888888889, "percentage": 100.0, "elapsed_time": "0:02:54", "remaining_time": "0:00:00", "throughput": 758.87, "total_tokens": 132048}
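Each line of this log is a standalone JSON record, so it can be folded into a loss curve with a few lines of Python. A minimal sketch, assuming the file name below; note the final record carries no "loss" field:

    # Sketch: turn the per-step log above into a step/loss listing.
    import json

    steps, losses = [], []
    with open("trainer_log.jsonl") as f:   # placeholder path
        for line in f:
            rec = json.loads(line)
            if "loss" in rec:              # last record has no loss field
                steps.append(rec["current_steps"])
                losses.append(rec["loss"])

    for s, l in zip(steps, losses):
        print(f"step {s:>2}: loss {l:.4f}")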
@ -0,0 +1,172 @@
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.08888888888888889,
  "eval_steps": 500,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005333333333333333,
      "grad_norm": 0.8999722599983215,
      "learning_rate": 4e-05,
      "loss": 1.5189,
      "num_input_tokens_seen": 9808,
      "step": 3
    },
    {
      "epoch": 0.010666666666666666,
      "grad_norm": NaN,
      "learning_rate": 6e-05,
      "loss": 1.5504,
      "num_input_tokens_seen": 19312,
      "step": 6
    },
    {
      "epoch": 0.016,
      "grad_norm": 0.9268227219581604,
      "learning_rate": 9.987820251299122e-05,
      "loss": 1.5661,
      "num_input_tokens_seen": 29232,
      "step": 9
    },
    {
      "epoch": 0.021333333333333333,
      "grad_norm": 1.1588999032974243,
      "learning_rate": 9.806308479691595e-05,
      "loss": 1.7033,
      "num_input_tokens_seen": 37984,
      "step": 12
    },
    {
      "epoch": 0.02666666666666667,
      "grad_norm": 1.0571134090423584,
      "learning_rate": 9.567727288213005e-05,
      "loss": 1.4225,
      "num_input_tokens_seen": 44592,
      "step": 15
    },
    {
      "epoch": 0.032,
      "grad_norm": 1.720107913017273,
      "learning_rate": 9.24024048078213e-05,
      "loss": 1.4217,
      "num_input_tokens_seen": 52400,
      "step": 18
    },
    {
      "epoch": 0.037333333333333336,
      "grad_norm": 0.930574893951416,
      "learning_rate": 8.596699001693255e-05,
      "loss": 1.2793,
      "num_input_tokens_seen": 60320,
      "step": 21
    },
    {
      "epoch": 0.042666666666666665,
      "grad_norm": 1.6979925632476807,
      "learning_rate": 7.795964517353735e-05,
      "loss": 1.4875,
      "num_input_tokens_seen": 67024,
      "step": 24
    },
    {
      "epoch": 0.048,
      "grad_norm": 2.2298834323883057,
      "learning_rate": 6.873032967079561e-05,
      "loss": 1.2446,
      "num_input_tokens_seen": 73776,
      "step": 27
    },
    {
      "epoch": 0.05333333333333334,
      "grad_norm": 1.6609553098678589,
      "learning_rate": 5.868240888334653e-05,
      "loss": 1.4691,
      "num_input_tokens_seen": 82592,
      "step": 30
    },
    {
      "epoch": 0.058666666666666666,
      "grad_norm": 1.1659108400344849,
      "learning_rate": 4.825502516487497e-05,
      "loss": 1.4451,
      "num_input_tokens_seen": 90512,
      "step": 33
    },
    {
      "epoch": 0.064,
      "grad_norm": 1.2636826038360596,
      "learning_rate": 3.790390522001662e-05,
      "loss": 1.4139,
      "num_input_tokens_seen": 96848,
      "step": 36
    },
    {
      "epoch": 0.06933333333333333,
      "grad_norm": 3.8678996562957764,
      "learning_rate": 2.8081442660546125e-05,
      "loss": 1.3205,
      "num_input_tokens_seen": 103728,
      "step": 39
    },
    {
      "epoch": 0.07466666666666667,
      "grad_norm": 0.6766985654830933,
      "learning_rate": 1.9216926233717085e-05,
      "loss": 1.2969,
      "num_input_tokens_seen": 112160,
      "step": 42
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.7236246466636658,
      "learning_rate": 1.1697777844051105e-05,
      "loss": 1.5026,
      "num_input_tokens_seen": 117984,
      "step": 45
    },
    {
      "epoch": 0.08533333333333333,
      "grad_norm": 0.8583828806877136,
      "learning_rate": 5.852620357053651e-06,
      "loss": 1.3583,
      "num_input_tokens_seen": 126624,
      "step": 48
    },
    {
      "epoch": 0.08888888888888889,
      "num_input_tokens_seen": 132048,
      "step": 50,
      "total_flos": 5638623387844608.0,
      "train_loss": 1.4425424909591675,
      "train_runtime": 174.0133,
      "train_samples_per_second": 4.597,
      "train_steps_per_second": 0.287,
      "train_tokens_per_second": 1140.143
    }
  ],
  "logging_steps": 3,
  "max_steps": 50,
  "num_input_tokens_seen": 132048,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5638623387844608.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
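One log_history entry above reports "grad_norm": NaN (step 6), which is worth flagging when auditing a run. A hedged sketch for scanning the state file for non-finite gradient norms (file name assumed; Python's json module parses NaN by default):

    # Sketch: flag non-finite grad norms recorded in trainer_state.json.
    import json
    import math

    with open("trainer_state.json") as f:   # placeholder path; json.load accepts NaN
        state = json.load(f)

    for entry in state["log_history"]:
        g = entry.get("grad_norm")           # final summary entry has no grad_norm
        if g is not None and not math.isfinite(g):
            print(f"non-finite grad_norm at step {entry['step']}")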
Binary file not shown.
Binary file not shown.