testrun: test 910b qwen
parent 9469b4c256
commit 8d6f544698
|
@@ -0,0 +1 @@
|
|||
{"cur_time": "2024-09-18 15:28:38", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
|
@@ -0,0 +1,7 @@
|
|||
[2024-09-18 15:28:52,091] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to npu (auto detect)
|
||||
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
|
||||
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
|
||||
09/18/2024 15:28:57 - INFO - llamafactory.hparams.parser - Process rank: 0, device: npu:0, n_gpu: 1, distributed training: False, compute dtype: torch.float16
|
||||
09/18/2024 15:28:58 - INFO - llamafactory.data.template - Add eos token: <|im_end|>
|
||||
09/18/2024 15:28:58 - INFO - llamafactory.data.template - Add pad token: <|im_end|>
|
||||
09/18/2024 15:28:58 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_1M_CN...
|
|
@@ -0,0 +1,31 @@
|
|||
cutoff_len: 1024
|
||||
dataset: belle_1m
|
||||
ddp_timeout: 180000000
|
||||
do_train: true
|
||||
eval_steps: 500
|
||||
eval_strategy: steps
|
||||
finetuning_type: lora
|
||||
fp16: true
|
||||
gradient_accumulation_steps: 8
|
||||
include_num_input_tokens_seen: true
|
||||
include_tokens_per_second: true
|
||||
learning_rate: 0.0001
|
||||
logging_steps: 3
|
||||
lora_target: all
|
||||
lr_scheduler_type: cosine
|
||||
max_samples: 10000
|
||||
max_steps: 50
|
||||
model_name_or_path: ../../../models/qwen
|
||||
num_train_epochs: 10.0
|
||||
output_dir: ./results/lora_sft_Qwen-7B_8_gpu_50_step_20240918152834
|
||||
overwrite_cache: true
|
||||
overwrite_output_dir: true
|
||||
per_device_eval_batch_size: 2
|
||||
per_device_train_batch_size: 2
|
||||
plot_loss: true
|
||||
preprocessing_num_workers: 16
|
||||
save_steps: 500
|
||||
stage: sft
|
||||
template: qwen
|
||||
val_size: 0.1
|
||||
warmup_ratio: 0.1
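For reference, a YAML file like the one above is consumed by LLaMA-Factory's training entry point. A minimal launch sketch, assuming the config is saved as `qwen_lora_sft.yaml` (a hypothetical filename) and that the `llamafactory-cli` command installed with LLaMA-Factory is on the PATH:

```python
# Minimal sketch: kick off the LoRA SFT run described by the YAML above.
# Assumes LLaMA-Factory is installed (providing `llamafactory-cli`) and the
# config was saved as qwen_lora_sft.yaml (hypothetical name).
import subprocess

subprocess.run(["llamafactory-cli", "train", "qwen_lora_sft.yaml"], check=True)
```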
|
|
@@ -0,0 +1 @@
|
|||
{"cur_time": "2024-09-18 15:29:45", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
|
@@ -0,0 +1,7 @@
|
|||
[2024-09-18 15:29:56,836] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to npu (auto detect)
|
||||
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
|
||||
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
|
||||
09/18/2024 15:30:02 - INFO - llamafactory.hparams.parser - Process rank: 0, device: npu:0, n_gpu: 1, distributed training: False, compute dtype: torch.float16
|
||||
09/18/2024 15:30:03 - INFO - llamafactory.data.template - Add eos token: <|im_end|>
|
||||
09/18/2024 15:30:03 - INFO - llamafactory.data.template - Add pad token: <|im_end|>
|
||||
09/18/2024 15:30:03 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_1M_CN...
|
|
@@ -0,0 +1,31 @@
|
|||
cutoff_len: 1024
|
||||
dataset: belle_1m
|
||||
ddp_timeout: 180000000
|
||||
do_train: true
|
||||
eval_steps: 500
|
||||
eval_strategy: steps
|
||||
finetuning_type: lora
|
||||
fp16: true
|
||||
gradient_accumulation_steps: 8
|
||||
include_num_input_tokens_seen: true
|
||||
include_tokens_per_second: true
|
||||
learning_rate: 0.0001
|
||||
logging_steps: 3
|
||||
lora_target: all
|
||||
lr_scheduler_type: cosine
|
||||
max_samples: 10000
|
||||
max_steps: 50
|
||||
model_name_or_path: ../../../models/qwen
|
||||
num_train_epochs: 10.0
|
||||
output_dir: ./results/lora_sft_Qwen-7B_8_gpu_50_step_20240918152941
|
||||
overwrite_cache: true
|
||||
overwrite_output_dir: true
|
||||
per_device_eval_batch_size: 2
|
||||
per_device_train_batch_size: 2
|
||||
plot_loss: true
|
||||
preprocessing_num_workers: 16
|
||||
save_steps: 500
|
||||
stage: sft
|
||||
template: qwen
|
||||
val_size: 0.1
|
||||
warmup_ratio: 0.1
|
|
@@ -0,0 +1,63 @@
|
|||
---
|
||||
base_model: ../../../models/qwen
|
||||
library_name: peft
|
||||
license: other
|
||||
tags:
|
||||
- llama-factory
|
||||
- lora
|
||||
- generated_from_trainer
|
||||
model-index:
|
||||
- name: lora_sft_Qwen-7B_8_gpu_50_step_20240918153046
|
||||
results: []
|
||||
---
|
||||
|
||||
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
||||
should probably proofread and complete it, then remove this comment. -->
|
||||
|
||||
# lora_sft_Qwen-7B_8_gpu_50_step_20240918153046
|
||||
|
||||
This model is a fine-tuned version of [../../../models/qwen](https://huggingface.co/../../../models/qwen) on the belle_1m dataset.
|
||||
It achieves the following results on the evaluation set:
|
||||
- Loss: 1.3353
|
||||
- Num Input Tokens Seen: 132048
|
||||
|
||||
## Model description
|
||||
|
||||
More information needed
|
||||
|
||||
## Intended uses & limitations
|
||||
|
||||
More information needed
|
||||
|
||||
## Training and evaluation data
|
||||
|
||||
More information needed
|
||||
|
||||
## Training procedure
|
||||
|
||||
### Training hyperparameters
|
||||
|
||||
The following hyperparameters were used during training:
|
||||
- learning_rate: 0.0001
|
||||
- train_batch_size: 2
|
||||
- eval_batch_size: 2
|
||||
- seed: 42
|
||||
- gradient_accumulation_steps: 8
|
||||
- total_train_batch_size: 16
|
||||
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
|
||||
- lr_scheduler_type: cosine
|
||||
- lr_scheduler_warmup_ratio: 0.1
|
||||
- training_steps: 50
|
||||
- mixed_precision_training: Native AMP
|
||||
|
||||
### Training results
|
||||
|
||||
|
||||
|
||||
### Framework versions
|
||||
|
||||
- PEFT 0.12.0
|
||||
- Transformers 4.43.4
|
||||
- Pytorch 2.1.0
|
||||
- Datasets 2.20.0
|
||||
- Tokenizers 0.19.1
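The card's Training results section above is empty, but the same information lives in the run's trainer_state.json (its log_history appears later in this diff) and in the training_loss.png written because plot_loss is enabled. A small re-plotting sketch, assuming the state file sits in the output_dir named in the card:

```python
# Minimal sketch: re-create the loss curve from trainer_state.json instead of
# relying on the saved training_loss.png. Path assumed from the card's run name.
import json
import matplotlib.pyplot as plt

state_path = "./results/lora_sft_Qwen-7B_8_gpu_50_step_20240918153046/trainer_state.json"
with open(state_path) as f:
    history = json.load(f)["log_history"]

steps = [e["step"] for e in history if "loss" in e]    # training-loss entries only
losses = [e["loss"] for e in history if "loss" in e]
plt.plot(steps, losses, marker="o")
plt.xlabel("step")
plt.ylabel("training loss")
plt.savefig("training_loss_replot.png")
```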
|
|
@@ -0,0 +1,31 @@
|
|||
{
|
||||
"alpha_pattern": {},
|
||||
"auto_mapping": null,
|
||||
"base_model_name_or_path": "../../../models/qwen",
|
||||
"bias": "none",
|
||||
"fan_in_fan_out": false,
|
||||
"inference_mode": true,
|
||||
"init_lora_weights": true,
|
||||
"layer_replication": null,
|
||||
"layers_pattern": null,
|
||||
"layers_to_transform": null,
|
||||
"loftq_config": {},
|
||||
"lora_alpha": 16,
|
||||
"lora_dropout": 0.0,
|
||||
"megatron_config": null,
|
||||
"megatron_core": "megatron.core",
|
||||
"modules_to_save": null,
|
||||
"peft_type": "LORA",
|
||||
"r": 8,
|
||||
"rank_pattern": {},
|
||||
"revision": null,
|
||||
"target_modules": [
|
||||
"w2",
|
||||
"c_proj",
|
||||
"w1",
|
||||
"c_attn"
|
||||
],
|
||||
"task_type": "CAUSAL_LM",
|
||||
"use_dora": false,
|
||||
"use_rslora": false
|
||||
}
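The adapter saved with this configuration can be attached back onto the base model with PEFT. A minimal loading sketch; the adapter path is assumed to be the output_dir of the run documented in the card above, and the Qwen base model needs trust_remote_code=True for its custom code:

```python
# Minimal sketch: reattach the saved LoRA adapter (w1/w2/c_proj/c_attn targets)
# to the Qwen base model. Paths are assumptions mirroring the config above.
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_path = "../../../models/qwen"
adapter_path = "./results/lora_sft_Qwen-7B_8_gpu_50_step_20240918153046"

tokenizer = AutoTokenizer.from_pretrained(base_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(base_path, trust_remote_code=True)
model = PeftModel.from_pretrained(model, adapter_path)
model.eval()
```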
|
Binary file not shown.
|
@@ -0,0 +1,14 @@
|
|||
{
|
||||
"epoch": 0.08888888888888889,
|
||||
"eval_loss": 1.3352934122085571,
|
||||
"eval_runtime": 55.0087,
|
||||
"eval_samples_per_second": 18.179,
|
||||
"eval_steps_per_second": 9.089,
|
||||
"num_input_tokens_seen": 132048,
|
||||
"total_flos": 5638623387844608.0,
|
||||
"train_loss": 1.4425424909591675,
|
||||
"train_runtime": 174.0133,
|
||||
"train_samples_per_second": 4.597,
|
||||
"train_steps_per_second": 0.287,
|
||||
"train_tokens_per_second": 1140.143
|
||||
}
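These train-side numbers are mutually consistent with the config (50 steps at an effective batch of 16, with 10 % of the 10000 sampled examples held out for eval). A quick check; the 9000-example train split size is an assumption derived from max_samples and val_size:

```python
# Consistency check of the train metrics in all_results.json above.
# Assumes a 9000-example train split (10000 max_samples, val_size 0.1).
steps = 50
effective_batch = 2 * 8          # per_device_train_batch_size x gradient_accumulation_steps
train_runtime = 174.0133         # seconds

samples_seen = steps * effective_batch          # 800
print(round(samples_seen / 9000, 4))            # 0.0889 -> "epoch"
print(round(samples_seen / train_runtime, 3))   # 4.597  -> train_samples_per_second
print(round(steps / train_runtime, 3))          # 0.287  -> train_steps_per_second
```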
|
|
@@ -0,0 +1,202 @@
|
|||
---
|
||||
base_model: ../../../models/qwen
|
||||
library_name: peft
|
||||
---
|
||||
|
||||
# Model Card for Model ID
|
||||
|
||||
<!-- Provide a quick summary of what the model is/does. -->
|
||||
|
||||
|
||||
|
||||
## Model Details
|
||||
|
||||
### Model Description
|
||||
|
||||
<!-- Provide a longer summary of what this model is. -->
|
||||
|
||||
|
||||
|
||||
- **Developed by:** [More Information Needed]
|
||||
- **Funded by [optional]:** [More Information Needed]
|
||||
- **Shared by [optional]:** [More Information Needed]
|
||||
- **Model type:** [More Information Needed]
|
||||
- **Language(s) (NLP):** [More Information Needed]
|
||||
- **License:** [More Information Needed]
|
||||
- **Finetuned from model [optional]:** [More Information Needed]
|
||||
|
||||
### Model Sources [optional]
|
||||
|
||||
<!-- Provide the basic links for the model. -->
|
||||
|
||||
- **Repository:** [More Information Needed]
|
||||
- **Paper [optional]:** [More Information Needed]
|
||||
- **Demo [optional]:** [More Information Needed]
|
||||
|
||||
## Uses
|
||||
|
||||
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
||||
|
||||
### Direct Use
|
||||
|
||||
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
### Downstream Use [optional]
|
||||
|
||||
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
### Out-of-Scope Use
|
||||
|
||||
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
## Bias, Risks, and Limitations
|
||||
|
||||
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
### Recommendations
|
||||
|
||||
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
||||
|
||||
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
||||
|
||||
## How to Get Started with the Model
|
||||
|
||||
Use the code below to get started with the model.
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
## Training Details
|
||||
|
||||
### Training Data
|
||||
|
||||
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
### Training Procedure
|
||||
|
||||
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
||||
|
||||
#### Preprocessing [optional]
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
|
||||
#### Training Hyperparameters
|
||||
|
||||
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
||||
|
||||
#### Speeds, Sizes, Times [optional]
|
||||
|
||||
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
## Evaluation
|
||||
|
||||
<!-- This section describes the evaluation protocols and provides the results. -->
|
||||
|
||||
### Testing Data, Factors & Metrics
|
||||
|
||||
#### Testing Data
|
||||
|
||||
<!-- This should link to a Dataset Card if possible. -->
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
#### Factors
|
||||
|
||||
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
#### Metrics
|
||||
|
||||
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
### Results
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
#### Summary
|
||||
|
||||
|
||||
|
||||
## Model Examination [optional]
|
||||
|
||||
<!-- Relevant interpretability work for the model goes here -->
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
## Environmental Impact
|
||||
|
||||
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
||||
|
||||
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
||||
|
||||
- **Hardware Type:** [More Information Needed]
|
||||
- **Hours used:** [More Information Needed]
|
||||
- **Cloud Provider:** [More Information Needed]
|
||||
- **Compute Region:** [More Information Needed]
|
||||
- **Carbon Emitted:** [More Information Needed]
|
||||
|
||||
## Technical Specifications [optional]
|
||||
|
||||
### Model Architecture and Objective
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
### Compute Infrastructure
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
#### Hardware
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
#### Software
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
## Citation [optional]
|
||||
|
||||
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
||||
|
||||
**BibTeX:**
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
**APA:**
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
## Glossary [optional]
|
||||
|
||||
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
## More Information [optional]
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
## Model Card Authors [optional]
|
||||
|
||||
[More Information Needed]
|
||||
|
||||
## Model Card Contact
|
||||
|
||||
[More Information Needed]
|
||||
### Framework versions
|
||||
|
||||
- PEFT 0.12.0
|
|
@@ -0,0 +1,31 @@
|
|||
{
|
||||
"alpha_pattern": {},
|
||||
"auto_mapping": null,
|
||||
"base_model_name_or_path": "../../../models/qwen",
|
||||
"bias": "none",
|
||||
"fan_in_fan_out": false,
|
||||
"inference_mode": true,
|
||||
"init_lora_weights": true,
|
||||
"layer_replication": null,
|
||||
"layers_pattern": null,
|
||||
"layers_to_transform": null,
|
||||
"loftq_config": {},
|
||||
"lora_alpha": 16,
|
||||
"lora_dropout": 0.0,
|
||||
"megatron_config": null,
|
||||
"megatron_core": "megatron.core",
|
||||
"modules_to_save": null,
|
||||
"peft_type": "LORA",
|
||||
"r": 8,
|
||||
"rank_pattern": {},
|
||||
"revision": null,
|
||||
"target_modules": [
|
||||
"w2",
|
||||
"c_proj",
|
||||
"w1",
|
||||
"c_attn"
|
||||
],
|
||||
"task_type": "CAUSAL_LM",
|
||||
"use_dora": false,
|
||||
"use_rslora": false
|
||||
}
|
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Binary file not shown.
Binary file not shown.
|
@@ -0,0 +1,10 @@
|
|||
{
|
||||
"eos_token": {
|
||||
"content": "<|im_end|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"pad_token": "<|im_end|>"
|
||||
}
|
|
@@ -0,0 +1,276 @@
|
|||
# Copyright (c) Alibaba Cloud.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
|
||||
"""Tokenization classes for QWen."""
|
||||
|
||||
import base64
|
||||
import logging
|
||||
import os
|
||||
import unicodedata
|
||||
from typing import Collection, Dict, List, Set, Tuple, Union
|
||||
|
||||
import tiktoken
|
||||
from transformers import PreTrainedTokenizer, AddedToken
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
|
||||
|
||||
PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
|
||||
ENDOFTEXT = "<|endoftext|>"
|
||||
IMSTART = "<|im_start|>"
|
||||
IMEND = "<|im_end|>"
|
||||
# as the default behavior is changed to allow special tokens in
|
||||
# regular texts, the surface forms of special tokens need to be
|
||||
# as different as possible to minimize the impact
|
||||
EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
|
||||
# changed to use actual index to avoid misconfiguration with vocabulary expansion
|
||||
SPECIAL_START_ID = 151643
|
||||
SPECIAL_TOKENS = tuple(
|
||||
enumerate(
|
||||
(
|
||||
(
|
||||
ENDOFTEXT,
|
||||
IMSTART,
|
||||
IMEND,
|
||||
)
|
||||
+ EXTRAS
|
||||
),
|
||||
start=SPECIAL_START_ID,
|
||||
)
|
||||
)
|
||||
SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)
|
||||
|
||||
|
||||
def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
|
||||
with open(tiktoken_bpe_file, "rb") as f:
|
||||
contents = f.read()
|
||||
return {
|
||||
base64.b64decode(token): int(rank)
|
||||
for token, rank in (line.split() for line in contents.splitlines() if line)
|
||||
}
|
||||
|
||||
|
||||
class QWenTokenizer(PreTrainedTokenizer):
|
||||
"""QWen tokenizer."""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_file,
|
||||
errors="replace",
|
||||
extra_vocab_file=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
# how to handle errors in decoding UTF-8 byte sequences
|
||||
# use ignore if you are in streaming inference
|
||||
self.errors = errors
|
||||
|
||||
self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: Dict[bytes, int]
|
||||
self.special_tokens = {
|
||||
token: index
|
||||
for index, token in SPECIAL_TOKENS
|
||||
}
|
||||
|
||||
# try load extra vocab from file
|
||||
if extra_vocab_file is not None:
|
||||
used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
|
||||
extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
|
||||
for token, index in extra_mergeable_ranks.items():
|
||||
if token in self.mergeable_ranks:
|
||||
logger.info(f"extra token {token} exists, skipping")
|
||||
continue
|
||||
if index in used_ids:
|
||||
logger.info(f'the index {index} for extra token {token} exists, skipping')
|
||||
continue
|
||||
self.mergeable_ranks[token] = index
|
||||
# the index may be sparse after this, but don't worry tiktoken.Encoding will handle this
|
||||
|
||||
enc = tiktoken.Encoding(
|
||||
"Qwen",
|
||||
pat_str=PAT_STR,
|
||||
mergeable_ranks=self.mergeable_ranks,
|
||||
special_tokens=self.special_tokens,
|
||||
)
|
||||
assert (
|
||||
len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
|
||||
), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
|
||||
|
||||
self.decoder = {
|
||||
v: k for k, v in self.mergeable_ranks.items()
|
||||
} # type: dict[int, bytes|str]
|
||||
self.decoder.update({v: k for k, v in self.special_tokens.items()})
|
||||
|
||||
self.tokenizer = enc # type: tiktoken.Encoding
|
||||
|
||||
self.eod_id = self.tokenizer.eot_token
|
||||
self.im_start_id = self.special_tokens[IMSTART]
|
||||
self.im_end_id = self.special_tokens[IMEND]
|
||||
|
||||
def __getstate__(self):
|
||||
# for pickle lovers
|
||||
state = self.__dict__.copy()
|
||||
del state["tokenizer"]
|
||||
return state
|
||||
|
||||
def __setstate__(self, state):
|
||||
# tokenizer is not python native; don't pass it; rebuild it
|
||||
self.__dict__.update(state)
|
||||
enc = tiktoken.Encoding(
|
||||
"Qwen",
|
||||
pat_str=PAT_STR,
|
||||
mergeable_ranks=self.mergeable_ranks,
|
||||
special_tokens=self.special_tokens,
|
||||
)
|
||||
self.tokenizer = enc
|
||||
|
||||
def __len__(self) -> int:
|
||||
return self.tokenizer.n_vocab
|
||||
|
||||
def get_vocab(self) -> Dict[bytes, int]:
|
||||
return self.mergeable_ranks
|
||||
|
||||
def convert_tokens_to_ids(
|
||||
self, tokens: Union[bytes, str, List[Union[bytes, str]]]
|
||||
) -> List[int]:
|
||||
ids = []
|
||||
if isinstance(tokens, (str, bytes)):
|
||||
if tokens in self.special_tokens:
|
||||
return self.special_tokens[tokens]
|
||||
else:
|
||||
return self.mergeable_ranks.get(tokens)
|
||||
for token in tokens:
|
||||
if token in self.special_tokens:
|
||||
ids.append(self.special_tokens[token])
|
||||
else:
|
||||
ids.append(self.mergeable_ranks.get(token))
|
||||
return ids
|
||||
|
||||
def _add_tokens(
|
||||
self,
|
||||
new_tokens: Union[List[str], List[AddedToken]],
|
||||
special_tokens: bool = False,
|
||||
) -> int:
|
||||
if not special_tokens and new_tokens:
|
||||
raise ValueError("Adding regular tokens is not supported")
|
||||
for token in new_tokens:
|
||||
surface_form = token.content if isinstance(token, AddedToken) else token
|
||||
if surface_form not in SPECIAL_TOKENS_SET:
|
||||
raise ValueError("Adding unknown special tokens is not supported")
|
||||
return 0
|
||||
|
||||
def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
|
||||
"""
|
||||
Save only the vocabulary of the tokenizer (vocabulary).
|
||||
|
||||
Returns:
|
||||
`Tuple(str)`: Paths to the files saved.
|
||||
"""
|
||||
file_path = os.path.join(save_directory, "qwen.tiktoken")
|
||||
with open(file_path, "w", encoding="utf8") as w:
|
||||
for k, v in self.mergeable_ranks.items():
|
||||
line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
|
||||
w.write(line)
|
||||
return (file_path,)
|
||||
|
||||
def tokenize(
|
||||
self,
|
||||
text: str,
|
||||
allowed_special: Union[Set, str] = "all",
|
||||
disallowed_special: Union[Collection, str] = (),
|
||||
**kwargs,
|
||||
) -> List[Union[bytes, str]]:
|
||||
"""
|
||||
Converts a string in a sequence of tokens.
|
||||
|
||||
Args:
|
||||
text (`str`):
|
||||
The sequence to be encoded.
|
||||
allowed_special (`Literal["all"]` or `set`):
|
||||
The surface forms of the tokens to be encoded as special tokens in regular texts.
|
||||
Default to "all".
|
||||
disallowed_special (`Literal["all"]` or `Collection`):
|
||||
The surface forms of the tokens that should not be in regular texts and trigger errors.
|
||||
Default to an empty tuple.
|
||||
|
||||
kwargs (additional keyword arguments, *optional*):
|
||||
Will be passed to the underlying model specific encode method.
|
||||
|
||||
Returns:
|
||||
`List[bytes|str]`: The list of tokens.
|
||||
"""
|
||||
tokens = []
|
||||
text = unicodedata.normalize("NFC", text)
|
||||
|
||||
# this implementation takes a detour: text -> token id -> token surface forms
|
||||
for t in self.tokenizer.encode(
|
||||
text, allowed_special=allowed_special, disallowed_special=disallowed_special
|
||||
):
|
||||
tokens.append(self.decoder[t])
|
||||
return tokens
|
||||
|
||||
def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
|
||||
"""
|
||||
Converts a sequence of tokens in a single string.
|
||||
"""
|
||||
text = ""
|
||||
temp = b""
|
||||
for t in tokens:
|
||||
if isinstance(t, str):
|
||||
if temp:
|
||||
text += temp.decode("utf-8", errors=self.errors)
|
||||
temp = b""
|
||||
text += t
|
||||
elif isinstance(t, bytes):
|
||||
temp += t
|
||||
else:
|
||||
raise TypeError("token should only be of type types or str")
|
||||
if temp:
|
||||
text += temp.decode("utf-8", errors=self.errors)
|
||||
return text
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
return self.tokenizer.n_vocab
|
||||
|
||||
def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
|
||||
"""Converts an id to a token, special tokens included"""
|
||||
if index in self.decoder:
|
||||
return self.decoder[index]
|
||||
raise ValueError("unknown ids")
|
||||
|
||||
def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
|
||||
"""Converts a token to an id using the vocab, special tokens included"""
|
||||
if token in self.special_tokens:
|
||||
return self.special_tokens[token]
|
||||
if token in self.mergeable_ranks:
|
||||
return self.mergeable_ranks[token]
|
||||
raise ValueError("unknown token")
|
||||
|
||||
def _tokenize(self, text: str, **kwargs):
|
||||
"""
|
||||
Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
|
||||
vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
|
||||
|
||||
Do NOT take care of added tokens.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def _decode(
|
||||
self,
|
||||
token_ids: Union[int, List[int]],
|
||||
skip_special_tokens: bool = False,
|
||||
errors: str = None,
|
||||
**kwargs,
|
||||
) -> str:
|
||||
if isinstance(token_ids, int):
|
||||
token_ids = [token_ids]
|
||||
if skip_special_tokens:
|
||||
token_ids = [i for i in token_ids if i < self.eod_id]
|
||||
return self.tokenizer.decode(token_ids, errors=errors or self.errors)
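Because QWenTokenizer is remote code registered via the auto_map in tokenizer_config.json (next file), it is normally loaded through AutoTokenizer with trust_remote_code=True. A minimal usage sketch; the asserted ids follow from SPECIAL_START_ID = 151643 above and match the <|im_start|>/<|im_end|> ids (151644/151645) visible in the training-example dump later in this diff:

```python
# Minimal sketch: load the custom tiktoken-based QWenTokenizer shipped with the
# checkpoint. The model directory (path from the training config) is assumed to
# contain tokenization_qwen.py, tokenizer_config.json and qwen.tiktoken.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("../../../models/qwen", trust_remote_code=True)

# Special ids start at SPECIAL_START_ID in the order (<|endoftext|>, <|im_start|>, <|im_end|>).
assert tok.convert_tokens_to_ids("<|endoftext|>") == 151643
assert tok.im_start_id == 151644 and tok.im_end_id == 151645
```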
|
|
@@ -0,0 +1,17 @@
|
|||
{
|
||||
"added_tokens_decoder": {},
|
||||
"auto_map": {
|
||||
"AutoTokenizer": [
|
||||
"tokenization_qwen.QWenTokenizer",
|
||||
null
|
||||
]
|
||||
},
|
||||
"chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
|
||||
"clean_up_tokenization_spaces": true,
|
||||
"eos_token": "<|im_end|>",
|
||||
"model_max_length": 32768,
|
||||
"pad_token": "<|im_end|>",
|
||||
"padding_side": "right",
|
||||
"split_special_tokens": false,
|
||||
"tokenizer_class": "QWenTokenizer"
|
||||
}
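The chat_template above is a ChatML-style Jinja template and is exercised through the tokenizer's apply_chat_template method. A small rendering sketch for a single user turn (with no explicit system turn the template injects the default system message); the tokenizer directory is assumed to be the base-model path from the training config:

```python
# Minimal sketch: render one user turn with the chat_template defined above.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("../../../models/qwen", trust_remote_code=True)
prompt = tok.apply_chat_template([{"role": "user", "content": "Hello"}], tokenize=False)

assert prompt == (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    "<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n"
)
```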
|
|
@@ -0,0 +1,161 @@
|
|||
{
|
||||
"best_metric": null,
|
||||
"best_model_checkpoint": null,
|
||||
"epoch": 0.08888888888888889,
|
||||
"eval_steps": 500,
|
||||
"global_step": 50,
|
||||
"is_hyper_param_search": false,
|
||||
"is_local_process_zero": true,
|
||||
"is_world_process_zero": true,
|
||||
"log_history": [
|
||||
{
|
||||
"epoch": 0.005333333333333333,
|
||||
"grad_norm": 0.8999722599983215,
|
||||
"learning_rate": 4e-05,
|
||||
"loss": 1.5189,
|
||||
"num_input_tokens_seen": 9808,
|
||||
"step": 3
|
||||
},
|
||||
{
|
||||
"epoch": 0.010666666666666666,
|
||||
"grad_norm": NaN,
|
||||
"learning_rate": 6e-05,
|
||||
"loss": 1.5504,
|
||||
"num_input_tokens_seen": 19312,
|
||||
"step": 6
|
||||
},
|
||||
{
|
||||
"epoch": 0.016,
|
||||
"grad_norm": 0.9268227219581604,
|
||||
"learning_rate": 9.987820251299122e-05,
|
||||
"loss": 1.5661,
|
||||
"num_input_tokens_seen": 29232,
|
||||
"step": 9
|
||||
},
|
||||
{
|
||||
"epoch": 0.021333333333333333,
|
||||
"grad_norm": 1.1588999032974243,
|
||||
"learning_rate": 9.806308479691595e-05,
|
||||
"loss": 1.7033,
|
||||
"num_input_tokens_seen": 37984,
|
||||
"step": 12
|
||||
},
|
||||
{
|
||||
"epoch": 0.02666666666666667,
|
||||
"grad_norm": 1.0571134090423584,
|
||||
"learning_rate": 9.567727288213005e-05,
|
||||
"loss": 1.4225,
|
||||
"num_input_tokens_seen": 44592,
|
||||
"step": 15
|
||||
},
|
||||
{
|
||||
"epoch": 0.032,
|
||||
"grad_norm": 1.720107913017273,
|
||||
"learning_rate": 9.24024048078213e-05,
|
||||
"loss": 1.4217,
|
||||
"num_input_tokens_seen": 52400,
|
||||
"step": 18
|
||||
},
|
||||
{
|
||||
"epoch": 0.037333333333333336,
|
||||
"grad_norm": 0.930574893951416,
|
||||
"learning_rate": 8.596699001693255e-05,
|
||||
"loss": 1.2793,
|
||||
"num_input_tokens_seen": 60320,
|
||||
"step": 21
|
||||
},
|
||||
{
|
||||
"epoch": 0.042666666666666665,
|
||||
"grad_norm": 1.6979925632476807,
|
||||
"learning_rate": 7.795964517353735e-05,
|
||||
"loss": 1.4875,
|
||||
"num_input_tokens_seen": 67024,
|
||||
"step": 24
|
||||
},
|
||||
{
|
||||
"epoch": 0.048,
|
||||
"grad_norm": 2.2298834323883057,
|
||||
"learning_rate": 6.873032967079561e-05,
|
||||
"loss": 1.2446,
|
||||
"num_input_tokens_seen": 73776,
|
||||
"step": 27
|
||||
},
|
||||
{
|
||||
"epoch": 0.05333333333333334,
|
||||
"grad_norm": 1.6609553098678589,
|
||||
"learning_rate": 5.868240888334653e-05,
|
||||
"loss": 1.4691,
|
||||
"num_input_tokens_seen": 82592,
|
||||
"step": 30
|
||||
},
|
||||
{
|
||||
"epoch": 0.058666666666666666,
|
||||
"grad_norm": 1.1659108400344849,
|
||||
"learning_rate": 4.825502516487497e-05,
|
||||
"loss": 1.4451,
|
||||
"num_input_tokens_seen": 90512,
|
||||
"step": 33
|
||||
},
|
||||
{
|
||||
"epoch": 0.064,
|
||||
"grad_norm": 1.2636826038360596,
|
||||
"learning_rate": 3.790390522001662e-05,
|
||||
"loss": 1.4139,
|
||||
"num_input_tokens_seen": 96848,
|
||||
"step": 36
|
||||
},
|
||||
{
|
||||
"epoch": 0.06933333333333333,
|
||||
"grad_norm": 3.8678996562957764,
|
||||
"learning_rate": 2.8081442660546125e-05,
|
||||
"loss": 1.3205,
|
||||
"num_input_tokens_seen": 103728,
|
||||
"step": 39
|
||||
},
|
||||
{
|
||||
"epoch": 0.07466666666666667,
|
||||
"grad_norm": 0.6766985654830933,
|
||||
"learning_rate": 1.9216926233717085e-05,
|
||||
"loss": 1.2969,
|
||||
"num_input_tokens_seen": 112160,
|
||||
"step": 42
|
||||
},
|
||||
{
|
||||
"epoch": 0.08,
|
||||
"grad_norm": 0.7236246466636658,
|
||||
"learning_rate": 1.1697777844051105e-05,
|
||||
"loss": 1.5026,
|
||||
"num_input_tokens_seen": 117984,
|
||||
"step": 45
|
||||
},
|
||||
{
|
||||
"epoch": 0.08533333333333333,
|
||||
"grad_norm": 0.8583828806877136,
|
||||
"learning_rate": 5.852620357053651e-06,
|
||||
"loss": 1.3583,
|
||||
"num_input_tokens_seen": 126624,
|
||||
"step": 48
|
||||
}
|
||||
],
|
||||
"logging_steps": 3,
|
||||
"max_steps": 50,
|
||||
"num_input_tokens_seen": 132048,
|
||||
"num_train_epochs": 1,
|
||||
"save_steps": 500,
|
||||
"stateful_callbacks": {
|
||||
"TrainerControl": {
|
||||
"args": {
|
||||
"should_epoch_stop": false,
|
||||
"should_evaluate": false,
|
||||
"should_log": false,
|
||||
"should_save": true,
|
||||
"should_training_stop": true
|
||||
},
|
||||
"attributes": {}
|
||||
}
|
||||
},
|
||||
"total_flos": 5638623387844608.0,
|
||||
"train_batch_size": 2,
|
||||
"trial_name": null,
|
||||
"trial_params": null
|
||||
}
|
Binary file not shown.
|
@@ -0,0 +1,8 @@
|
|||
{
|
||||
"epoch": 0.08888888888888889,
|
||||
"eval_loss": 1.3352934122085571,
|
||||
"eval_runtime": 55.0087,
|
||||
"eval_samples_per_second": 18.179,
|
||||
"eval_steps_per_second": 9.089,
|
||||
"num_input_tokens_seen": 132048
|
||||
}
|
|
@@ -0,0 +1,9 @@
|
|||
{"cur_time": "2024-09-18 15:30:50", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
||||
{"cur_time": "2024-09-18 15:31:52", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
||||
{"cur_time": "2024-09-18 15:32:54", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
||||
{"cur_time": "2024-09-18 15:33:57", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
||||
{"cur_time": "2024-09-18 15:34:59", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
||||
{"cur_time": "2024-09-18 15:36:01", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
||||
{"cur_time": "2024-09-18 15:37:04", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
||||
{"cur_time": "2024-09-18 15:38:06", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
||||
{"cur_time": "2024-09-18 15:39:08", "npu_power_dissipation": [{"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }], "device_mem_usage": []}
|
|
@@ -0,0 +1,75 @@
|
|||
[2024-09-18 15:31:02,784] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to npu (auto detect)
|
||||
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
|
||||
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
|
||||
09/18/2024 15:31:08 - INFO - llamafactory.hparams.parser - Process rank: 0, device: npu:0, n_gpu: 1, distributed training: False, compute dtype: torch.float16
|
||||
09/18/2024 15:31:09 - INFO - llamafactory.data.template - Add eos token: <|im_end|>
|
||||
09/18/2024 15:31:09 - INFO - llamafactory.data.template - Add pad token: <|im_end|>
|
||||
09/18/2024 15:31:09 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_1M_CN...
|
||||
training example:
|
||||
input_ids:
|
||||
[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 104317, 89012, 22382, 106096, 64471, 101137, 72881, 102648, 46448, 1773, 62244, 107132, 37945, 99553, 25177, 101898, 8997, 100431, 99639, 113773, 9370, 111749, 25, 330, 100012, 105435, 99487, 100220, 3837, 104817, 44063, 99553, 102322, 20074, 33108, 116993, 3837, 23031, 104022, 100147, 101313, 1773, 698, 151645, 198, 151644, 77091, 198, 99487, 111749, 101137, 72881, 102648, 46448, 1773, 151645]
|
||||
inputs:
|
||||
<|im_start|>system
|
||||
You are a helpful assistant.<|im_end|>
|
||||
<|im_start|>user
|
||||
判断给定的文章是否符合语法规则。如果不符合,请提供修改建议。
|
||||
下面是一篇文章的开头: "为了探讨这个主题,本文将提供一系列数据和实例,以证明这一观点。"
|
||||
<|im_end|>
|
||||
<|im_start|>assistant
|
||||
这个开头符合语法规则。<|im_end|>
|
||||
label_ids:
|
||||
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 99487, 111749, 101137, 72881, 102648, 46448, 1773, 151645]
|
||||
labels:
|
||||
这个开头符合语法规则。<|im_end|>
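(The dumped example is a Chinese grammar-checking exchange: the user asks whether the quoted opening sentence is grammatical and the assistant answers that it is.) What matters for SFT is the masking pattern: every prompt position in label_ids is -100, the default ignore_index of PyTorch's cross-entropy loss, so only the assistant reply plus its closing <|im_end|> contributes to the loss. A small check, assuming input_ids and label_ids hold the two lists printed above:

```python
# Minimal sketch: verify the prompt-masking pattern in the dump above.
# Assumes input_ids / label_ids are the two lists printed by the data loader.
IGNORE_INDEX = -100  # default ignore_index of torch.nn.CrossEntropyLoss

supervised = [(tok, lab) for tok, lab in zip(input_ids, label_ids) if lab != IGNORE_INDEX]
assert all(tok == lab for tok, lab in supervised)                      # unmasked labels mirror the inputs
assert [lab for _, lab in supervised] == input_ids[-len(supervised):]  # ...and form the reply suffix
assert supervised[-1][1] == 151645                                     # reply ends with <|im_end|>
```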
|
||||
09/18/2024 15:35:27 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
|
||||
09/18/2024 15:35:27 - INFO - llamafactory.model.model_utils.attention - Using vanilla attention implementation.
|
||||
09/18/2024 15:35:27 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
|
||||
09/18/2024 15:35:27 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA
|
||||
09/18/2024 15:35:27 - INFO - llamafactory.model.model_utils.misc - Found linear modules: w2,w1,c_proj,c_attn
|
||||
09/18/2024 15:35:28 - INFO - llamafactory.model.loader - trainable params: 17,891,328 || all params: 7,739,215,872 || trainable%: 0.2312
|
||||
Gradient overflow. Skipping step
|
||||
Loss scaler reducing loss scale to 32768.0
|
||||
{'loss': 1.5189, 'grad_norm': 0.8999722599983215, 'learning_rate': 4e-05, 'epoch': 0.01, 'num_input_tokens_seen': 9808}
|
||||
Gradient overflow. Skipping step
|
||||
Loss scaler reducing loss scale to 16384.0
|
||||
Gradient overflow. Skipping step
|
||||
Loss scaler reducing loss scale to 8192.0
|
||||
{'loss': 1.5504, 'grad_norm': nan, 'learning_rate': 6e-05, 'epoch': 0.01, 'num_input_tokens_seen': 19312}
|
||||
{'loss': 1.5661, 'grad_norm': 0.9268227219581604, 'learning_rate': 9.987820251299122e-05, 'epoch': 0.02, 'num_input_tokens_seen': 29232}
|
||||
{'loss': 1.7033, 'grad_norm': 1.1588999032974243, 'learning_rate': 9.806308479691595e-05, 'epoch': 0.02, 'num_input_tokens_seen': 37984}
|
||||
Gradient overflow. Skipping step
|
||||
Loss scaler reducing loss scale to 4096.0
|
||||
{'loss': 1.4225, 'grad_norm': 1.0571134090423584, 'learning_rate': 9.567727288213005e-05, 'epoch': 0.03, 'num_input_tokens_seen': 44592}
|
||||
Gradient overflow. Skipping step
|
||||
Loss scaler reducing loss scale to 2048.0
|
||||
{'loss': 1.4217, 'grad_norm': 1.720107913017273, 'learning_rate': 9.24024048078213e-05, 'epoch': 0.03, 'num_input_tokens_seen': 52400}
|
||||
{'loss': 1.2793, 'grad_norm': 0.930574893951416, 'learning_rate': 8.596699001693255e-05, 'epoch': 0.04, 'num_input_tokens_seen': 60320}
|
||||
{'loss': 1.4875, 'grad_norm': 1.6979925632476807, 'learning_rate': 7.795964517353735e-05, 'epoch': 0.04, 'num_input_tokens_seen': 67024}
|
||||
{'loss': 1.2446, 'grad_norm': 2.2298834323883057, 'learning_rate': 6.873032967079561e-05, 'epoch': 0.05, 'num_input_tokens_seen': 73776}
|
||||
{'loss': 1.4691, 'grad_norm': 1.6609553098678589, 'learning_rate': 5.868240888334653e-05, 'epoch': 0.05, 'num_input_tokens_seen': 82592}
|
||||
{'loss': 1.4451, 'grad_norm': 1.1659108400344849, 'learning_rate': 4.825502516487497e-05, 'epoch': 0.06, 'num_input_tokens_seen': 90512}
|
||||
{'loss': 1.4139, 'grad_norm': 1.2636826038360596, 'learning_rate': 3.790390522001662e-05, 'epoch': 0.06, 'num_input_tokens_seen': 96848}
|
||||
{'loss': 1.3205, 'grad_norm': 3.8678996562957764, 'learning_rate': 2.8081442660546125e-05, 'epoch': 0.07, 'num_input_tokens_seen': 103728}
|
||||
{'loss': 1.2969, 'grad_norm': 0.6766985654830933, 'learning_rate': 1.9216926233717085e-05, 'epoch': 0.07, 'num_input_tokens_seen': 112160}
|
||||
{'loss': 1.5026, 'grad_norm': 0.7236246466636658, 'learning_rate': 1.1697777844051105e-05, 'epoch': 0.08, 'num_input_tokens_seen': 117984}
|
||||
{'loss': 1.3583, 'grad_norm': 0.8583828806877136, 'learning_rate': 5.852620357053651e-06, 'epoch': 0.09, 'num_input_tokens_seen': 126624}
|
||||
{'train_runtime': 174.0133, 'train_samples_per_second': 4.597, 'train_steps_per_second': 0.287, 'train_tokens_per_second': 1140.143, 'train_loss': 1.4425424909591675, 'epoch': 0.09, 'num_input_tokens_seen': 132048}
|
||||
***** train metrics *****
|
||||
epoch = 0.0889
|
||||
num_input_tokens_seen = 132048
|
||||
total_flos = 5251377GF
|
||||
train_loss = 1.4425
|
||||
train_runtime = 0:02:54.01
|
||||
train_samples_per_second = 4.597
|
||||
train_steps_per_second = 0.287
|
||||
train_tokens_per_second = 1140.143
|
||||
Figure saved at: ./results/lora_sft_Qwen-7B_8_gpu_50_step_20240918153046/training_loss.png
|
||||
09/18/2024 15:38:23 - WARNING - llamafactory.extras.ploting - No metric eval_loss to plot.
|
||||
09/18/2024 15:38:23 - WARNING - llamafactory.extras.ploting - No metric eval_accuracy to plot.
|
||||
***** eval metrics *****
|
||||
epoch = 0.0889
|
||||
eval_loss = 1.3353
|
||||
eval_runtime = 0:00:55.00
|
||||
eval_samples_per_second = 18.179
|
||||
eval_steps_per_second = 9.089
|
||||
num_input_tokens_seen = 132048
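The eval throughput above is likewise consistent with a 1000-example validation split (10 % of the 10000 sampled examples) evaluated at per_device_eval_batch_size 2; the split size is an assumption derived from the config:

```python
# Consistency check of the eval metrics printed above.
# Assumes a 1000-example validation split (10000 max_samples, val_size 0.1).
eval_samples, eval_batch, eval_runtime = 1000, 2, 55.0087

print(round(eval_samples / eval_runtime, 3))               # 18.179 -> eval_samples_per_second
print(round(eval_samples / eval_batch / eval_runtime, 3))  # 9.089  -> eval_steps_per_second
```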
|
|
@@ -0,0 +1,31 @@
|
|||
cutoff_len: 1024
|
||||
dataset: belle_1m
|
||||
ddp_timeout: 180000000
|
||||
do_train: true
|
||||
eval_steps: 500
|
||||
eval_strategy: steps
|
||||
finetuning_type: lora
|
||||
fp16: true
|
||||
gradient_accumulation_steps: 8
|
||||
include_num_input_tokens_seen: true
|
||||
include_tokens_per_second: true
|
||||
learning_rate: 0.0001
|
||||
logging_steps: 3
|
||||
lora_target: all
|
||||
lr_scheduler_type: cosine
|
||||
max_samples: 10000
|
||||
max_steps: 50
|
||||
model_name_or_path: ../../../models/qwen
|
||||
num_train_epochs: 10.0
|
||||
output_dir: ./results/lora_sft_Qwen-7B_8_gpu_50_step_20240918153046
|
||||
overwrite_cache: true
|
||||
overwrite_output_dir: true
|
||||
per_device_eval_batch_size: 2
|
||||
per_device_train_batch_size: 2
|
||||
plot_loss: true
|
||||
preprocessing_num_workers: 16
|
||||
save_steps: 500
|
||||
stage: sft
|
||||
template: qwen
|
||||
val_size: 0.1
|
||||
warmup_ratio: 0.1
|
File diff suppressed because it is too large
|
@@ -0,0 +1,10 @@
|
|||
{
|
||||
"eos_token": {
|
||||
"content": "<|im_end|>",
|
||||
"lstrip": false,
|
||||
"normalized": false,
|
||||
"rstrip": false,
|
||||
"single_word": false
|
||||
},
|
||||
"pad_token": "<|im_end|>"
|
||||
}
|
|
@@ -0,0 +1,276 @@
|
|||
# Copyright (c) Alibaba Cloud.
|
||||
#
|
||||
# This source code is licensed under the license found in the
|
||||
# LICENSE file in the root directory of this source tree.
|
||||
|
||||
"""Tokenization classes for QWen."""
|
||||
|
||||
import base64
|
||||
import logging
|
||||
import os
|
||||
import unicodedata
|
||||
from typing import Collection, Dict, List, Set, Tuple, Union
|
||||
|
||||
import tiktoken
|
||||
from transformers import PreTrainedTokenizer, AddedToken
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
|
||||
|
||||
PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
|
||||
ENDOFTEXT = "<|endoftext|>"
|
||||
IMSTART = "<|im_start|>"
|
||||
IMEND = "<|im_end|>"
|
||||
# as the default behavior is changed to allow special tokens in
|
||||
# regular texts, the surface forms of special tokens need to be
|
||||
# as different as possible to minimize the impact
|
||||
EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
|
||||
# changed to use actual index to avoid misconfiguration with vocabulary expansion
|
||||
SPECIAL_START_ID = 151643
|
||||
SPECIAL_TOKENS = tuple(
|
||||
enumerate(
|
||||
(
|
||||
(
|
||||
ENDOFTEXT,
|
||||
IMSTART,
|
||||
IMEND,
|
||||
)
|
||||
+ EXTRAS
|
||||
),
|
||||
start=SPECIAL_START_ID,
|
||||
)
|
||||
)
|
||||
SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)
|
||||
|
||||
|
||||
def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
|
||||
with open(tiktoken_bpe_file, "rb") as f:
|
||||
contents = f.read()
|
||||
return {
|
||||
base64.b64decode(token): int(rank)
|
||||
for token, rank in (line.split() for line in contents.splitlines() if line)
|
||||
}
|
||||
|
||||
|
||||
class QWenTokenizer(PreTrainedTokenizer):
|
||||
"""QWen tokenizer."""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_file,
|
||||
errors="replace",
|
||||
extra_vocab_file=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
# how to handle errors in decoding UTF-8 byte sequences
|
||||
# use ignore if you are in streaming inference
|
||||
self.errors = errors
|
||||
|
||||
self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: Dict[bytes, int]
|
||||
self.special_tokens = {
|
||||
token: index
|
||||
for index, token in SPECIAL_TOKENS
|
||||
}
|
||||
|
||||
# try load extra vocab from file
|
||||
if extra_vocab_file is not None:
|
||||
used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
|
||||
extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
|
||||
for token, index in extra_mergeable_ranks.items():
|
||||
if token in self.mergeable_ranks:
|
||||
logger.info(f"extra token {token} exists, skipping")
|
||||
continue
|
||||
if index in used_ids:
|
||||
logger.info(f'the index {index} for extra token {token} exists, skipping')
|
||||
continue
|
||||
self.mergeable_ranks[token] = index
|
||||
# the index may be sparse after this, but don't worry tiktoken.Encoding will handle this
|
||||
|
||||
enc = tiktoken.Encoding(
|
||||
"Qwen",
|
||||
pat_str=PAT_STR,
|
||||
mergeable_ranks=self.mergeable_ranks,
|
||||
special_tokens=self.special_tokens,
|
||||
)
|
||||
assert (
|
||||
len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
|
||||
), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
|
||||
|
||||
self.decoder = {
|
||||
v: k for k, v in self.mergeable_ranks.items()
|
||||
} # type: dict[int, bytes|str]
|
||||
self.decoder.update({v: k for k, v in self.special_tokens.items()})
|
||||
|
||||
self.tokenizer = enc # type: tiktoken.Encoding
|
||||
|
||||
self.eod_id = self.tokenizer.eot_token
|
||||
self.im_start_id = self.special_tokens[IMSTART]
|
||||
self.im_end_id = self.special_tokens[IMEND]
|
||||
|
||||
def __getstate__(self):
|
||||
# for pickle lovers
|
||||
state = self.__dict__.copy()
|
||||
del state["tokenizer"]
|
||||
return state
|
||||
|
||||
def __setstate__(self, state):
|
||||
# tokenizer is not python native; don't pass it; rebuild it
|
||||
self.__dict__.update(state)
|
||||
enc = tiktoken.Encoding(
|
||||
"Qwen",
|
||||
pat_str=PAT_STR,
|
||||
mergeable_ranks=self.mergeable_ranks,
|
||||
special_tokens=self.special_tokens,
|
||||
)
|
||||
self.tokenizer = enc
|
||||
|
||||
def __len__(self) -> int:
|
||||
return self.tokenizer.n_vocab
|
||||
|
||||
def get_vocab(self) -> Dict[bytes, int]:
|
||||
return self.mergeable_ranks
|
||||
|
||||
def convert_tokens_to_ids(
|
||||
self, tokens: Union[bytes, str, List[Union[bytes, str]]]
|
||||
) -> List[int]:
|
||||
ids = []
|
||||
if isinstance(tokens, (str, bytes)):
|
||||
if tokens in self.special_tokens:
|
||||
return self.special_tokens[tokens]
|
||||
else:
|
||||
return self.mergeable_ranks.get(tokens)
|
||||
for token in tokens:
|
||||
if token in self.special_tokens:
|
||||
ids.append(self.special_tokens[token])
|
||||
else:
|
||||
ids.append(self.mergeable_ranks.get(token))
|
||||
return ids
|
||||
|
||||
def _add_tokens(
|
||||
self,
|
||||
new_tokens: Union[List[str], List[AddedToken]],
|
||||
special_tokens: bool = False,
|
||||
) -> int:
|
||||
if not special_tokens and new_tokens:
|
||||
raise ValueError("Adding regular tokens is not supported")
|
||||
for token in new_tokens:
|
||||
surface_form = token.content if isinstance(token, AddedToken) else token
|
||||
if surface_form not in SPECIAL_TOKENS_SET:
|
||||
raise ValueError("Adding unknown special tokens is not supported")
|
||||
return 0
|
||||
|
||||
def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
|
||||
"""
|
||||
Save only the vocabulary of the tokenizer (vocabulary).
|
||||
|
||||
Returns:
|
||||
`Tuple(str)`: Paths to the files saved.
|
||||
"""
|
||||
file_path = os.path.join(save_directory, "qwen.tiktoken")
|
||||
with open(file_path, "w", encoding="utf8") as w:
|
||||
for k, v in self.mergeable_ranks.items():
|
||||
line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
|
||||
w.write(line)
|
||||
return (file_path,)
|
||||
|
||||
def tokenize(
|
||||
self,
|
||||
text: str,
|
||||
allowed_special: Union[Set, str] = "all",
|
||||
disallowed_special: Union[Collection, str] = (),
|
||||
**kwargs,
|
||||
) -> List[Union[bytes, str]]:
|
||||
"""
|
||||
Converts a string in a sequence of tokens.
|
||||
|
||||
Args:
|
||||
text (`str`):
|
||||
The sequence to be encoded.
|
||||
allowed_special (`Literal["all"]` or `set`):
|
||||
The surface forms of the tokens to be encoded as special tokens in regular texts.
|
||||
Default to "all".
|
||||
disallowed_special (`Literal["all"]` or `Collection`):
|
||||
The surface forms of the tokens that should not be in regular texts and trigger errors.
|
||||
Default to an empty tuple.
|
||||
|
||||
kwargs (additional keyword arguments, *optional*):
|
||||
Will be passed to the underlying model specific encode method.
|
||||
|
||||
Returns:
|
||||
`List[bytes|str]`: The list of tokens.
|
||||
"""
|
||||
tokens = []
|
||||
text = unicodedata.normalize("NFC", text)
|
||||
|
||||
# this implementation takes a detour: text -> token id -> token surface forms
|
||||
for t in self.tokenizer.encode(
|
||||
text, allowed_special=allowed_special, disallowed_special=disallowed_special
|
||||
):
|
||||
tokens.append(self.decoder[t])
|
||||
return tokens
|
||||
|
||||
def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
|
||||
"""
|
||||
Converts a sequence of tokens in a single string.
|
||||
"""
|
||||
text = ""
|
||||
temp = b""
|
||||
for t in tokens:
|
||||
if isinstance(t, str):
|
||||
if temp:
|
||||
text += temp.decode("utf-8", errors=self.errors)
|
||||
temp = b""
|
||||
text += t
|
||||
elif isinstance(t, bytes):
|
||||
temp += t
|
||||
else:
|
||||
raise TypeError("token should only be of type types or str")
|
||||
if temp:
|
||||
text += temp.decode("utf-8", errors=self.errors)
|
||||
return text
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
return self.tokenizer.n_vocab
|
||||
|
||||
def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
|
||||
"""Converts an id to a token, special tokens included"""
|
||||
if index in self.decoder:
|
||||
return self.decoder[index]
|
||||
raise ValueError("unknown ids")
|
||||
|
||||
def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
|
||||
"""Converts a token to an id using the vocab, special tokens included"""
|
||||
if token in self.special_tokens:
|
||||
return self.special_tokens[token]
|
||||
if token in self.mergeable_ranks:
|
||||
return self.mergeable_ranks[token]
|
||||
raise ValueError("unknown token")
|
||||
|
||||
def _tokenize(self, text: str, **kwargs):
|
||||
"""
|
||||
Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
|
||||
vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
|
||||
|
||||
Do NOT take care of added tokens.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def _decode(
|
||||
self,
|
||||
token_ids: Union[int, List[int]],
|
||||
skip_special_tokens: bool = False,
|
||||
errors: str = None,
|
||||
**kwargs,
|
||||
) -> str:
|
||||
if isinstance(token_ids, int):
|
||||
token_ids = [token_ids]
|
||||
if skip_special_tokens:
|
||||
token_ids = [i for i in token_ids if i < self.eod_id]
|
||||
return self.tokenizer.decode(token_ids, errors=errors or self.errors)
|
|
@@ -0,0 +1,17 @@
|
|||
{
|
||||
"added_tokens_decoder": {},
|
||||
"auto_map": {
|
||||
"AutoTokenizer": [
|
||||
"tokenization_qwen.QWenTokenizer",
|
||||
null
|
||||
]
|
||||
},
|
||||
"chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
|
||||
"clean_up_tokenization_spaces": true,
|
||||
"eos_token": "<|im_end|>",
|
||||
"model_max_length": 32768,
|
||||
"pad_token": "<|im_end|>",
|
||||
"padding_side": "right",
|
||||
"split_special_tokens": false,
|
||||
"tokenizer_class": "QWenTokenizer"
|
||||
}
|
|
@@ -0,0 +1,10 @@
|
|||
{
|
||||
"epoch": 0.08888888888888889,
|
||||
"num_input_tokens_seen": 132048,
|
||||
"total_flos": 5638623387844608.0,
|
||||
"train_loss": 1.4425424909591675,
|
||||
"train_runtime": 174.0133,
|
||||
"train_samples_per_second": 4.597,
|
||||
"train_steps_per_second": 0.287,
|
||||
"train_tokens_per_second": 1140.143
|
||||
}
|
|
@@ -0,0 +1,17 @@
|
|||
{"current_steps": 3, "total_steps": 50, "loss": 1.5189, "learning_rate": 4e-05, "epoch": 0.005333333333333333, "percentage": 6.0, "elapsed_time": "0:00:12", "remaining_time": "0:03:13", "throughput": 793.94, "total_tokens": 9808}
|
||||
{"current_steps": 6, "total_steps": 50, "loss": 1.5504, "learning_rate": 6e-05, "epoch": 0.010666666666666666, "percentage": 12.0, "elapsed_time": "0:00:22", "remaining_time": "0:02:45", "throughput": 856.96, "total_tokens": 19312}
|
||||
{"current_steps": 9, "total_steps": 50, "loss": 1.5661, "learning_rate": 9.987820251299122e-05, "epoch": 0.016, "percentage": 18.0, "elapsed_time": "0:00:33", "remaining_time": "0:02:31", "throughput": 880.54, "total_tokens": 29232}
|
||||
{"current_steps": 12, "total_steps": 50, "loss": 1.7033, "learning_rate": 9.806308479691595e-05, "epoch": 0.021333333333333333, "percentage": 24.0, "elapsed_time": "0:00:43", "remaining_time": "0:02:18", "throughput": 870.04, "total_tokens": 37984}
|
||||
{"current_steps": 15, "total_steps": 50, "loss": 1.4225, "learning_rate": 9.567727288213005e-05, "epoch": 0.02666666666666667, "percentage": 30.0, "elapsed_time": "0:00:53", "remaining_time": "0:02:05", "throughput": 826.38, "total_tokens": 44592}
|
||||
{"current_steps": 18, "total_steps": 50, "loss": 1.4217, "learning_rate": 9.24024048078213e-05, "epoch": 0.032, "percentage": 36.0, "elapsed_time": "0:01:04", "remaining_time": "0:01:54", "throughput": 815.59, "total_tokens": 52400}
|
||||
{"current_steps": 21, "total_steps": 50, "loss": 1.2793, "learning_rate": 8.596699001693255e-05, "epoch": 0.037333333333333336, "percentage": 42.0, "elapsed_time": "0:01:14", "remaining_time": "0:01:42", "throughput": 808.87, "total_tokens": 60320}
|
||||
{"current_steps": 24, "total_steps": 50, "loss": 1.4875, "learning_rate": 7.795964517353735e-05, "epoch": 0.042666666666666665, "percentage": 48.0, "elapsed_time": "0:01:25", "remaining_time": "0:01:32", "throughput": 786.17, "total_tokens": 67024}
|
||||
{"current_steps": 27, "total_steps": 50, "loss": 1.2446, "learning_rate": 6.873032967079561e-05, "epoch": 0.048, "percentage": 54.0, "elapsed_time": "0:01:35", "remaining_time": "0:01:21", "throughput": 769.07, "total_tokens": 73776}
|
||||
{"current_steps": 30, "total_steps": 50, "loss": 1.4691, "learning_rate": 5.868240888334653e-05, "epoch": 0.05333333333333334, "percentage": 60.0, "elapsed_time": "0:01:46", "remaining_time": "0:01:11", "throughput": 774.71, "total_tokens": 82592}
|
||||
{"current_steps": 33, "total_steps": 50, "loss": 1.4451, "learning_rate": 4.825502516487497e-05, "epoch": 0.058666666666666666, "percentage": 66.0, "elapsed_time": "0:01:56", "remaining_time": "0:01:00", "throughput": 774.14, "total_tokens": 90512}
|
||||
{"current_steps": 36, "total_steps": 50, "loss": 1.4139, "learning_rate": 3.790390522001662e-05, "epoch": 0.064, "percentage": 72.0, "elapsed_time": "0:02:06", "remaining_time": "0:00:49", "throughput": 763.48, "total_tokens": 96848}
|
||||
{"current_steps": 39, "total_steps": 50, "loss": 1.3205, "learning_rate": 2.8081442660546125e-05, "epoch": 0.06933333333333333, "percentage": 78.0, "elapsed_time": "0:02:16", "remaining_time": "0:00:38", "throughput": 758.29, "total_tokens": 103728}
|
||||
{"current_steps": 42, "total_steps": 50, "loss": 1.2969, "learning_rate": 1.9216926233717085e-05, "epoch": 0.07466666666666667, "percentage": 84.0, "elapsed_time": "0:02:26", "remaining_time": "0:00:27", "throughput": 763.76, "total_tokens": 112160}
|
||||
{"current_steps": 45, "total_steps": 50, "loss": 1.5026, "learning_rate": 1.1697777844051105e-05, "epoch": 0.08, "percentage": 90.0, "elapsed_time": "0:02:36", "remaining_time": "0:00:17", "throughput": 753.01, "total_tokens": 117984}
|
||||
{"current_steps": 48, "total_steps": 50, "loss": 1.3583, "learning_rate": 5.852620357053651e-06, "epoch": 0.08533333333333333, "percentage": 96.0, "elapsed_time": "0:02:46", "remaining_time": "0:00:06", "throughput": 760.49, "total_tokens": 126624}
|
||||
{"current_steps": 50, "total_steps": 50, "epoch": 0.08888888888888889, "percentage": 100.0, "elapsed_time": "0:02:54", "remaining_time": "0:00:00", "throughput": 758.87, "total_tokens": 132048}
|
|
@@ -0,0 +1,172 @@
|
|||
{
|
||||
"best_metric": null,
|
||||
"best_model_checkpoint": null,
|
||||
"epoch": 0.08888888888888889,
|
||||
"eval_steps": 500,
|
||||
"global_step": 50,
|
||||
"is_hyper_param_search": false,
|
||||
"is_local_process_zero": true,
|
||||
"is_world_process_zero": true,
|
||||
"log_history": [
|
||||
{
|
||||
"epoch": 0.005333333333333333,
|
||||
"grad_norm": 0.8999722599983215,
|
||||
"learning_rate": 4e-05,
|
||||
"loss": 1.5189,
|
||||
"num_input_tokens_seen": 9808,
|
||||
"step": 3
|
||||
},
|
||||
{
|
||||
"epoch": 0.010666666666666666,
|
||||
"grad_norm": NaN,
|
||||
"learning_rate": 6e-05,
|
||||
"loss": 1.5504,
|
||||
"num_input_tokens_seen": 19312,
|
||||
"step": 6
|
||||
},
|
||||
{
|
||||
"epoch": 0.016,
|
||||
"grad_norm": 0.9268227219581604,
|
||||
"learning_rate": 9.987820251299122e-05,
|
||||
"loss": 1.5661,
|
||||
"num_input_tokens_seen": 29232,
|
||||
"step": 9
|
||||
},
|
||||
{
|
||||
"epoch": 0.021333333333333333,
|
||||
"grad_norm": 1.1588999032974243,
|
||||
"learning_rate": 9.806308479691595e-05,
|
||||
"loss": 1.7033,
|
||||
"num_input_tokens_seen": 37984,
|
||||
"step": 12
|
||||
},
|
||||
{
|
||||
"epoch": 0.02666666666666667,
|
||||
"grad_norm": 1.0571134090423584,
|
||||
"learning_rate": 9.567727288213005e-05,
|
||||
"loss": 1.4225,
|
||||
"num_input_tokens_seen": 44592,
|
||||
"step": 15
|
||||
},
|
||||
{
|
||||
"epoch": 0.032,
|
||||
"grad_norm": 1.720107913017273,
|
||||
"learning_rate": 9.24024048078213e-05,
|
||||
"loss": 1.4217,
|
||||
"num_input_tokens_seen": 52400,
|
||||
"step": 18
|
||||
},
|
||||
{
|
||||
"epoch": 0.037333333333333336,
|
||||
"grad_norm": 0.930574893951416,
|
||||
"learning_rate": 8.596699001693255e-05,
|
||||
"loss": 1.2793,
|
||||
"num_input_tokens_seen": 60320,
|
||||
"step": 21
|
||||
},
|
||||
{
|
||||
"epoch": 0.042666666666666665,
|
||||
"grad_norm": 1.6979925632476807,
|
||||
"learning_rate": 7.795964517353735e-05,
|
||||
"loss": 1.4875,
|
||||
"num_input_tokens_seen": 67024,
|
||||
"step": 24
|
||||
},
|
||||
{
|
||||
"epoch": 0.048,
|
||||
"grad_norm": 2.2298834323883057,
|
||||
"learning_rate": 6.873032967079561e-05,
|
||||
"loss": 1.2446,
|
||||
"num_input_tokens_seen": 73776,
|
||||
"step": 27
|
||||
},
|
||||
{
|
||||
"epoch": 0.05333333333333334,
|
||||
"grad_norm": 1.6609553098678589,
|
||||
"learning_rate": 5.868240888334653e-05,
|
||||
"loss": 1.4691,
|
||||
"num_input_tokens_seen": 82592,
|
||||
"step": 30
|
||||
},
|
||||
{
|
||||
"epoch": 0.058666666666666666,
|
||||
"grad_norm": 1.1659108400344849,
|
||||
"learning_rate": 4.825502516487497e-05,
|
||||
"loss": 1.4451,
|
||||
"num_input_tokens_seen": 90512,
|
||||
"step": 33
|
||||
},
|
||||
{
|
||||
"epoch": 0.064,
|
||||
"grad_norm": 1.2636826038360596,
|
||||
"learning_rate": 3.790390522001662e-05,
|
||||
"loss": 1.4139,
|
||||
"num_input_tokens_seen": 96848,
|
||||
"step": 36
|
||||
},
|
||||
{
|
||||
"epoch": 0.06933333333333333,
|
||||
"grad_norm": 3.8678996562957764,
|
||||
"learning_rate": 2.8081442660546125e-05,
|
||||
"loss": 1.3205,
|
||||
"num_input_tokens_seen": 103728,
|
||||
"step": 39
|
||||
},
|
||||
{
|
||||
"epoch": 0.07466666666666667,
|
||||
"grad_norm": 0.6766985654830933,
|
||||
"learning_rate": 1.9216926233717085e-05,
|
||||
"loss": 1.2969,
|
||||
"num_input_tokens_seen": 112160,
|
||||
"step": 42
|
||||
},
|
||||
{
|
||||
"epoch": 0.08,
|
||||
"grad_norm": 0.7236246466636658,
|
||||
"learning_rate": 1.1697777844051105e-05,
|
||||
"loss": 1.5026,
|
||||
"num_input_tokens_seen": 117984,
|
||||
"step": 45
|
||||
},
|
||||
{
|
||||
"epoch": 0.08533333333333333,
|
||||
"grad_norm": 0.8583828806877136,
|
||||
"learning_rate": 5.852620357053651e-06,
|
||||
"loss": 1.3583,
|
||||
"num_input_tokens_seen": 126624,
|
||||
"step": 48
|
||||
},
|
||||
{
|
||||
"epoch": 0.08888888888888889,
|
||||
"num_input_tokens_seen": 132048,
|
||||
"step": 50,
|
||||
"total_flos": 5638623387844608.0,
|
||||
"train_loss": 1.4425424909591675,
|
||||
"train_runtime": 174.0133,
|
||||
"train_samples_per_second": 4.597,
|
||||
"train_steps_per_second": 0.287,
|
||||
"train_tokens_per_second": 1140.143
|
||||
}
|
||||
],
|
||||
"logging_steps": 3,
|
||||
"max_steps": 50,
|
||||
"num_input_tokens_seen": 132048,
|
||||
"num_train_epochs": 1,
|
||||
"save_steps": 500,
|
||||
"stateful_callbacks": {
|
||||
"TrainerControl": {
|
||||
"args": {
|
||||
"should_epoch_stop": false,
|
||||
"should_evaluate": false,
|
||||
"should_log": false,
|
||||
"should_save": true,
|
||||
"should_training_stop": true
|
||||
},
|
||||
"attributes": {}
|
||||
}
|
||||
},
|
||||
"total_flos": 5638623387844608.0,
|
||||
"train_batch_size": 2,
|
||||
"trial_name": null,
|
||||
"trial_params": null
|
||||
}
|
Binary file not shown.
Binary file not shown.
After: image, 46 KiB