testrun: single card

parent 43f4c8f914
commit 63556d6571

@@ -0,0 +1,63 @@
---
base_model: ../../../models/qwen
library_name: peft
license: other
tags:
- llama-factory
- lora
- generated_from_trainer
model-index:
- name: lora_sft_Qwen-7B_1_gpu_50_step_20240918162245
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

# lora_sft_Qwen-7B_1_gpu_50_step_20240918162245

This model is a fine-tuned version of the local base model `../../../models/qwen` on the belle_1m dataset.
It achieves the following results on the evaluation set:
- Loss: 1.3353
- Num Input Tokens Seen: 132048

## Model description

More information needed

## Intended uses & limitations

More information needed

## Training and evaluation data

More information needed

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 0.0001
- train_batch_size: 2
- eval_batch_size: 2
- seed: 42
- gradient_accumulation_steps: 8
- total_train_batch_size: 16
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: cosine
- lr_scheduler_warmup_ratio: 0.1
- training_steps: 50
- mixed_precision_training: Native AMP

### Training results

### Framework versions

- PEFT 0.12.0
- Transformers 4.43.4
- Pytorch 2.1.0
- Datasets 2.20.0
- Tokenizers 0.19.1
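As a quick sanity check on the hyperparameters listed above, the derived quantities follow directly from the per-device settings; the sketch below is illustrative only and assumes a single device, as the commit title ("single card") indicates.

# Minimal check of the derived training quantities in the card above.
per_device_train_batch_size = 2
gradient_accumulation_steps = 8
num_devices = 1                      # single card, per the commit title
training_steps = 50
warmup_ratio = 0.1

total_train_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_devices
warmup_steps = int(training_steps * warmup_ratio)

print(total_train_batch_size)  # 16, matching "total_train_batch_size" above
print(warmup_steps)            # 5 warmup optimizer steps before the cosine decay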
@@ -0,0 +1,31 @@
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": "../../../models/qwen",
  "bias": "none",
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 16,
  "lora_dropout": 0.0,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 8,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "w2",
    "c_proj",
    "w1",
    "c_attn"
  ],
  "task_type": "CAUSAL_LM",
  "use_dora": false,
  "use_rslora": false
]

Binary file not shown.
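A minimal sketch of how an adapter saved with the configuration above could be loaded for inference with PEFT. The paths are placeholders taken from this diff, and `trust_remote_code=True` is assumed because the checkpoint ships its own QWenTokenizer.

# Hypothetical paths; adjust to wherever the base model and this adapter live.
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model_path = "../../../models/qwen"   # as in "base_model_name_or_path"
adapter_path = "./results/lora_sft_Qwen-7B_1_gpu_50_step_20240918162245"

tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(base_model_path, trust_remote_code=True)
model = PeftModel.from_pretrained(model, adapter_path)  # applies the r=8, alpha=16 LoRA weights
model.eval()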
@@ -0,0 +1,14 @@
{
  "epoch": 0.08888888888888889,
  "eval_loss": 1.3352934122085571,
  "eval_runtime": 54.3905,
  "eval_samples_per_second": 18.386,
  "eval_steps_per_second": 9.193,
  "num_input_tokens_seen": 132048,
  "total_flos": 5638623387844608.0,
  "train_loss": 1.4425424814224244,
  "train_runtime": 174.5036,
  "train_samples_per_second": 4.584,
  "train_steps_per_second": 0.287,
  "train_tokens_per_second": 1136.939
}
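The evaluation loss above is a token-level cross-entropy, so the corresponding perplexity is simply its exponential; a one-line check for reference:

import math

eval_loss = 1.3352934122085571
print(math.exp(eval_loss))  # ~3.80 perplexity on the held-out belle_1m split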
@@ -0,0 +1,202 @@
---
base_model: ../../../models/qwen
library_name: peft
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->

## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->

- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

### Direct Use

<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->

[More Information Needed]

### Downstream Use [optional]

<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->

[More Information Needed]

### Out-of-Scope Use

<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->

[More Information Needed]

## Bias, Risks, and Limitations

<!-- This section is meant to convey both technical and sociotechnical limitations. -->

[More Information Needed]

### Recommendations

<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.

## How to Get Started with the Model

Use the code below to get started with the model.

[More Information Needed]

## Training Details

### Training Data

<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

[More Information Needed]

### Training Procedure

<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->

#### Preprocessing [optional]

[More Information Needed]

#### Training Hyperparameters

- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->

#### Speeds, Sizes, Times [optional]

<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->

[More Information Needed]

## Evaluation

<!-- This section describes the evaluation protocols and provides the results. -->

### Testing Data, Factors & Metrics

#### Testing Data

<!-- This should link to a Dataset Card if possible. -->

[More Information Needed]

#### Factors

<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->

[More Information Needed]

#### Metrics

<!-- These are the evaluation metrics being used, ideally with a description of why. -->

[More Information Needed]

### Results

[More Information Needed]

#### Summary

## Model Examination [optional]

<!-- Relevant interpretability work for the model goes here -->

[More Information Needed]

## Environmental Impact

<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->

Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).

- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]

## Technical Specifications [optional]

### Model Architecture and Objective

[More Information Needed]

### Compute Infrastructure

[More Information Needed]

#### Hardware

[More Information Needed]

#### Software

[More Information Needed]

## Citation [optional]

<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->

**BibTeX:**

[More Information Needed]

**APA:**

[More Information Needed]

## Glossary [optional]

<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->

[More Information Needed]

## More Information [optional]

[More Information Needed]

## Model Card Authors [optional]

[More Information Needed]

## Model Card Contact

[More Information Needed]

### Framework versions

- PEFT 0.12.0
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,10 @@
{
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "<|im_end|>"
}
@@ -0,0 +1,276 @@
# Copyright (c) Alibaba Cloud.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

"""Tokenization classes for QWen."""

import base64
import logging
import os
import unicodedata
from typing import Collection, Dict, List, Set, Tuple, Union

import tiktoken
from transformers import PreTrainedTokenizer, AddedToken

logger = logging.getLogger(__name__)


VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}

PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
ENDOFTEXT = "<|endoftext|>"
IMSTART = "<|im_start|>"
IMEND = "<|im_end|>"
# as the default behavior is changed to allow special tokens in
# regular texts, the surface forms of special tokens need to be
# as different as possible to minimize the impact
EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
# changed to use actual index to avoid misconfiguration with vocabulary expansion
SPECIAL_START_ID = 151643
SPECIAL_TOKENS = tuple(
    enumerate(
        (
            (
                ENDOFTEXT,
                IMSTART,
                IMEND,
            )
            + EXTRAS
        ),
        start=SPECIAL_START_ID,
    )
)
SPECIAL_TOKENS_SET = set(t for i, t in SPECIAL_TOKENS)


def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
    with open(tiktoken_bpe_file, "rb") as f:
        contents = f.read()
    return {
        base64.b64decode(token): int(rank)
        for token, rank in (line.split() for line in contents.splitlines() if line)
    }


class QWenTokenizer(PreTrainedTokenizer):
    """QWen tokenizer."""

    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(
        self,
        vocab_file,
        errors="replace",
        extra_vocab_file=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # how to handle errors in decoding UTF-8 byte sequences
        # use ignore if you are in streaming inference
        self.errors = errors

        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: Dict[bytes, int]
        self.special_tokens = {
            token: index
            for index, token in SPECIAL_TOKENS
        }

        # try to load extra vocab from file
        if extra_vocab_file is not None:
            used_ids = set(self.mergeable_ranks.values()) | set(self.special_tokens.values())
            extra_mergeable_ranks = _load_tiktoken_bpe(extra_vocab_file)
            for token, index in extra_mergeable_ranks.items():
                if token in self.mergeable_ranks:
                    logger.info(f"extra token {token} exists, skipping")
                    continue
                if index in used_ids:
                    logger.info(f'the index {index} for extra token {token} exists, skipping')
                    continue
                self.mergeable_ranks[token] = index
            # the index may be sparse after this, but don't worry, tiktoken.Encoding will handle this

        enc = tiktoken.Encoding(
            "Qwen",
            pat_str=PAT_STR,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        assert (
            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
        ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"

        self.decoder = {
            v: k for k, v in self.mergeable_ranks.items()
        }  # type: dict[int, bytes|str]
        self.decoder.update({v: k for k, v in self.special_tokens.items()})

        self.tokenizer = enc  # type: tiktoken.Encoding

        self.eod_id = self.tokenizer.eot_token
        self.im_start_id = self.special_tokens[IMSTART]
        self.im_end_id = self.special_tokens[IMEND]

    def __getstate__(self):
        # for pickle lovers
        state = self.__dict__.copy()
        del state["tokenizer"]
        return state

    def __setstate__(self, state):
        # tokenizer is not python native; don't pass it; rebuild it
        self.__dict__.update(state)
        enc = tiktoken.Encoding(
            "Qwen",
            pat_str=PAT_STR,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        self.tokenizer = enc

    def __len__(self) -> int:
        return self.tokenizer.n_vocab

    def get_vocab(self) -> Dict[bytes, int]:
        return self.mergeable_ranks

    def convert_tokens_to_ids(
        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
    ) -> List[int]:
        ids = []
        if isinstance(tokens, (str, bytes)):
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            else:
                return self.mergeable_ranks.get(tokens)
        for token in tokens:
            if token in self.special_tokens:
                ids.append(self.special_tokens[token])
            else:
                ids.append(self.mergeable_ranks.get(token))
        return ids

    def _add_tokens(
        self,
        new_tokens: Union[List[str], List[AddedToken]],
        special_tokens: bool = False,
    ) -> int:
        if not special_tokens and new_tokens:
            raise ValueError("Adding regular tokens is not supported")
        for token in new_tokens:
            surface_form = token.content if isinstance(token, AddedToken) else token
            if surface_form not in SPECIAL_TOKENS_SET:
                raise ValueError("Adding unknown special tokens is not supported")
        return 0

    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
        """
        Save only the vocabulary of the tokenizer (vocabulary).

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        file_path = os.path.join(save_directory, "qwen.tiktoken")
        with open(file_path, "w", encoding="utf8") as w:
            for k, v in self.mergeable_ranks.items():
                line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
                w.write(line)
        return (file_path,)

    def tokenize(
        self,
        text: str,
        allowed_special: Union[Set, str] = "all",
        disallowed_special: Union[Collection, str] = (),
        **kwargs,
    ) -> List[Union[bytes, str]]:
        """
        Converts a string into a sequence of tokens.

        Args:
            text (`str`):
                The sequence to be encoded.
            allowed_special (`Literal["all"]` or `set`):
                The surface forms of the tokens to be encoded as special tokens in regular texts.
                Default to "all".
            disallowed_special (`Literal["all"]` or `Collection`):
                The surface forms of the tokens that should not be in regular texts and trigger errors.
                Default to an empty tuple.

            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific encode method.

        Returns:
            `List[bytes|str]`: The list of tokens.
        """
        tokens = []
        text = unicodedata.normalize("NFC", text)

        # this implementation takes a detour: text -> token id -> token surface forms
        for t in self.tokenizer.encode(
            text, allowed_special=allowed_special, disallowed_special=disallowed_special
        ):
            tokens.append(self.decoder[t])
        return tokens

    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
        """
        Converts a sequence of tokens into a single string.
        """
        text = ""
        temp = b""
        for t in tokens:
            if isinstance(t, str):
                if temp:
                    text += temp.decode("utf-8", errors=self.errors)
                    temp = b""
                text += t
            elif isinstance(t, bytes):
                temp += t
            else:
                raise TypeError("token should only be of type bytes or str")
        if temp:
            text += temp.decode("utf-8", errors=self.errors)
        return text

    @property
    def vocab_size(self):
        return self.tokenizer.n_vocab

    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
        """Converts an id to a token, special tokens included"""
        if index in self.decoder:
            return self.decoder[index]
        raise ValueError("unknown ids")

    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
        """Converts a token to an id using the vocab, special tokens included"""
        if token in self.special_tokens:
            return self.special_tokens[token]
        if token in self.mergeable_ranks:
            return self.mergeable_ranks[token]
        raise ValueError("unknown token")

    def _tokenize(self, text: str, **kwargs):
        """
        Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).

        Do NOT take care of added tokens.
        """
        raise NotImplementedError

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        errors: str = None,
        **kwargs,
    ) -> str:
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if skip_special_tokens:
            token_ids = [i for i in token_ids if i < self.eod_id]
        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
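A small usage sketch for the tokenizer class above, loaded through AutoTokenizer with trust_remote_code=True (which resolves tokenization_qwen.QWenTokenizer via the auto_map in the tokenizer config that follows); the model path is a placeholder from this run.

from transformers import AutoTokenizer

# Placeholder path; this run used a local checkpoint at ../../../models/qwen.
tokenizer = AutoTokenizer.from_pretrained("../../../models/qwen", trust_remote_code=True)

ids = tokenizer.encode("You are a helpful assistant.")
print(ids)                    # BPE ranks from qwen.tiktoken (byte-level, base64-encoded vocab file)
print(tokenizer.decode(ids))  # round-trips back to the original string
print(tokenizer.im_end_id)    # 151645, the <|im_end|> id used as both EOS and PAD in this run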
@@ -0,0 +1,17 @@
{
  "added_tokens_decoder": {},
  "auto_map": {
    "AutoTokenizer": [
      "tokenization_qwen.QWenTokenizer",
      null
    ]
  },
  "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
  "clean_up_tokenization_spaces": true,
  "eos_token": "<|im_end|>",
  "model_max_length": 32768,
  "pad_token": "<|im_end|>",
  "padding_side": "right",
  "split_special_tokens": false,
  "tokenizer_class": "QWenTokenizer"
}
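The chat_template above renders ChatML-style turns, which is exactly the format visible in the decoded training example later in this diff. A sketch of how it would typically be applied, assuming the tokenizer loaded as in the earlier example:

# Assumes `tokenizer` was loaded with trust_remote_code=True as sketched above.
messages = [
    {"role": "user", "content": "Hello"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print(prompt)
# Expected rendering (the template always appends the assistant header after a user turn):
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Hello<|im_end|>
# <|im_start|>assistant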
@@ -0,0 +1,161 @@
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.08888888888888889,
  "eval_steps": 500,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.005333333333333333, "grad_norm": 0.8999722599983215, "learning_rate": 4e-05, "loss": 1.5189, "num_input_tokens_seen": 9808, "step": 3},
    {"epoch": 0.010666666666666666, "grad_norm": NaN, "learning_rate": 6e-05, "loss": 1.5504, "num_input_tokens_seen": 19312, "step": 6},
    {"epoch": 0.016, "grad_norm": 0.9268227219581604, "learning_rate": 9.987820251299122e-05, "loss": 1.5661, "num_input_tokens_seen": 29232, "step": 9},
    {"epoch": 0.021333333333333333, "grad_norm": 1.1588999032974243, "learning_rate": 9.806308479691595e-05, "loss": 1.7033, "num_input_tokens_seen": 37984, "step": 12},
    {"epoch": 0.02666666666666667, "grad_norm": 1.0571134090423584, "learning_rate": 9.567727288213005e-05, "loss": 1.4225, "num_input_tokens_seen": 44592, "step": 15},
    {"epoch": 0.032, "grad_norm": 1.720107913017273, "learning_rate": 9.24024048078213e-05, "loss": 1.4217, "num_input_tokens_seen": 52400, "step": 18},
    {"epoch": 0.037333333333333336, "grad_norm": 0.930574893951416, "learning_rate": 8.596699001693255e-05, "loss": 1.2793, "num_input_tokens_seen": 60320, "step": 21},
    {"epoch": 0.042666666666666665, "grad_norm": 1.6979925632476807, "learning_rate": 7.795964517353735e-05, "loss": 1.4875, "num_input_tokens_seen": 67024, "step": 24},
    {"epoch": 0.048, "grad_norm": 2.2298834323883057, "learning_rate": 6.873032967079561e-05, "loss": 1.2446, "num_input_tokens_seen": 73776, "step": 27},
    {"epoch": 0.05333333333333334, "grad_norm": 1.6609553098678589, "learning_rate": 5.868240888334653e-05, "loss": 1.4691, "num_input_tokens_seen": 82592, "step": 30},
    {"epoch": 0.058666666666666666, "grad_norm": 1.1659108400344849, "learning_rate": 4.825502516487497e-05, "loss": 1.4451, "num_input_tokens_seen": 90512, "step": 33},
    {"epoch": 0.064, "grad_norm": 1.2636826038360596, "learning_rate": 3.790390522001662e-05, "loss": 1.4139, "num_input_tokens_seen": 96848, "step": 36},
    {"epoch": 0.06933333333333333, "grad_norm": 3.8678996562957764, "learning_rate": 2.8081442660546125e-05, "loss": 1.3205, "num_input_tokens_seen": 103728, "step": 39},
    {"epoch": 0.07466666666666667, "grad_norm": 0.6766985654830933, "learning_rate": 1.9216926233717085e-05, "loss": 1.2969, "num_input_tokens_seen": 112160, "step": 42},
    {"epoch": 0.08, "grad_norm": 0.7236246466636658, "learning_rate": 1.1697777844051105e-05, "loss": 1.5026, "num_input_tokens_seen": 117984, "step": 45},
    {"epoch": 0.08533333333333333, "grad_norm": 0.8583828806877136, "learning_rate": 5.852620357053651e-06, "loss": 1.3583, "num_input_tokens_seen": 126624, "step": 48}
  ],
  "logging_steps": 3,
  "max_steps": 50,
  "num_input_tokens_seen": 132048,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5638623387844608.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}

Binary file not shown.
@@ -0,0 +1,8 @@
{
  "epoch": 0.08888888888888889,
  "eval_loss": 1.3352934122085571,
  "eval_runtime": 54.3905,
  "eval_samples_per_second": 18.386,
  "eval_steps_per_second": 9.193,
  "num_input_tokens_seen": 132048
}
@@ -0,0 +1,7 @@
{"cur_time": "2024-09-18 16:22:51", "npu_power_dissipation": [{"npu_id": 0, "power_dissipation": }, {"npu_id": 1, "power_dissipation": }, {"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 4, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }, {"npu_id": 7, "power_dissipation": }], "device_mem_usage": []}
{"cur_time": "2024-09-18 16:23:56", "npu_power_dissipation": [{"npu_id": 0, "power_dissipation": }, {"npu_id": 1, "power_dissipation": }, {"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 4, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }, {"npu_id": 7, "power_dissipation": }], "device_mem_usage": []}
{"cur_time": "2024-09-18 16:25:01", "npu_power_dissipation": [{"npu_id": 0, "power_dissipation": }, {"npu_id": 1, "power_dissipation": }, {"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 4, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }, {"npu_id": 7, "power_dissipation": }], "device_mem_usage": []}
{"cur_time": "2024-09-18 16:26:05", "npu_power_dissipation": [{"npu_id": 0, "power_dissipation": }, {"npu_id": 1, "power_dissipation": }, {"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 4, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }, {"npu_id": 7, "power_dissipation": }], "device_mem_usage": []}
{"cur_time": "2024-09-18 16:27:10", "npu_power_dissipation": [{"npu_id": 0, "power_dissipation": }, {"npu_id": 1, "power_dissipation": }, {"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 4, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }, {"npu_id": 7, "power_dissipation": }], "device_mem_usage": []}
{"cur_time": "2024-09-18 16:28:15", "npu_power_dissipation": [{"npu_id": 0, "power_dissipation": }, {"npu_id": 1, "power_dissipation": }, {"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 4, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }, {"npu_id": 7, "power_dissipation": }], "device_mem_usage": []}
{"cur_time": "2024-09-18 16:29:19", "npu_power_dissipation": [{"npu_id": 0, "power_dissipation": }, {"npu_id": 1, "power_dissipation": }, {"npu_id": 2, "power_dissipation": }, {"npu_id": 3, "power_dissipation": }, {"npu_id": 4, "power_dissipation": }, {"npu_id": 5, "power_dissipation": }, {"npu_id": 6, "power_dissipation": }, {"npu_id": 7, "power_dissipation": }], "device_mem_usage": []}
@@ -0,0 +1,75 @@
[2024-09-18 16:23:02,978] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to npu (auto detect)
[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
09/18/2024 16:23:08 - INFO - llamafactory.hparams.parser - Process rank: 0, device: npu:0, n_gpu: 1, distributed training: False, compute dtype: torch.float16
09/18/2024 16:23:09 - INFO - llamafactory.data.template - Add eos token: <|im_end|>
09/18/2024 16:23:09 - INFO - llamafactory.data.template - Add pad token: <|im_end|>
09/18/2024 16:23:09 - INFO - llamafactory.data.loader - Loading dataset AI-ModelScope/train_1M_CN...
training example:
input_ids:
[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 104317, 89012, 22382, 106096, 64471, 101137, 72881, 102648, 46448, 1773, 62244, 107132, 37945, 99553, 25177, 101898, 8997, 100431, 99639, 113773, 9370, 111749, 25, 330, 100012, 105435, 99487, 100220, 3837, 104817, 44063, 99553, 102322, 20074, 33108, 116993, 3837, 23031, 104022, 100147, 101313, 1773, 698, 151645, 198, 151644, 77091, 198, 99487, 111749, 101137, 72881, 102648, 46448, 1773, 151645]
inputs:
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
判断给定的文章是否符合语法规则。如果不符合,请提供修改建议。
下面是一篇文章的开头: "为了探讨这个主题,本文将提供一系列数据和实例,以证明这一观点。"
<|im_end|>
<|im_start|>assistant
这个开头符合语法规则。<|im_end|>
label_ids:
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 99487, 111749, 101137, 72881, 102648, 46448, 1773, 151645]
labels:
这个开头符合语法规则。<|im_end|>
09/18/2024 16:26:20 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
09/18/2024 16:26:20 - INFO - llamafactory.model.model_utils.attention - Using vanilla attention implementation.
09/18/2024 16:26:20 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
09/18/2024 16:26:20 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA
09/18/2024 16:26:20 - INFO - llamafactory.model.model_utils.misc - Found linear modules: w2,c_proj,w1,c_attn
09/18/2024 16:26:21 - INFO - llamafactory.model.loader - trainable params: 17,891,328 || all params: 7,739,215,872 || trainable%: 0.2312
Gradient overflow. Skipping step
Loss scaler reducing loss scale to 32768.0
{'loss': 1.5189, 'grad_norm': 0.8999722599983215, 'learning_rate': 4e-05, 'epoch': 0.01, 'num_input_tokens_seen': 9808}
Gradient overflow. Skipping step
Loss scaler reducing loss scale to 16384.0
Gradient overflow. Skipping step
Loss scaler reducing loss scale to 8192.0
{'loss': 1.5504, 'grad_norm': nan, 'learning_rate': 6e-05, 'epoch': 0.01, 'num_input_tokens_seen': 19312}
{'loss': 1.5661, 'grad_norm': 0.9268227219581604, 'learning_rate': 9.987820251299122e-05, 'epoch': 0.02, 'num_input_tokens_seen': 29232}
{'loss': 1.7033, 'grad_norm': 1.1588999032974243, 'learning_rate': 9.806308479691595e-05, 'epoch': 0.02, 'num_input_tokens_seen': 37984}
Gradient overflow. Skipping step
Loss scaler reducing loss scale to 4096.0
{'loss': 1.4225, 'grad_norm': 1.0571134090423584, 'learning_rate': 9.567727288213005e-05, 'epoch': 0.03, 'num_input_tokens_seen': 44592}
Gradient overflow. Skipping step
Loss scaler reducing loss scale to 2048.0
{'loss': 1.4217, 'grad_norm': 1.720107913017273, 'learning_rate': 9.24024048078213e-05, 'epoch': 0.03, 'num_input_tokens_seen': 52400}
{'loss': 1.2793, 'grad_norm': 0.930574893951416, 'learning_rate': 8.596699001693255e-05, 'epoch': 0.04, 'num_input_tokens_seen': 60320}
{'loss': 1.4875, 'grad_norm': 1.6979925632476807, 'learning_rate': 7.795964517353735e-05, 'epoch': 0.04, 'num_input_tokens_seen': 67024}
{'loss': 1.2446, 'grad_norm': 2.2298834323883057, 'learning_rate': 6.873032967079561e-05, 'epoch': 0.05, 'num_input_tokens_seen': 73776}
{'loss': 1.4691, 'grad_norm': 1.6609553098678589, 'learning_rate': 5.868240888334653e-05, 'epoch': 0.05, 'num_input_tokens_seen': 82592}
{'loss': 1.4451, 'grad_norm': 1.1659108400344849, 'learning_rate': 4.825502516487497e-05, 'epoch': 0.06, 'num_input_tokens_seen': 90512}
{'loss': 1.4139, 'grad_norm': 1.2636826038360596, 'learning_rate': 3.790390522001662e-05, 'epoch': 0.06, 'num_input_tokens_seen': 96848}
{'loss': 1.3205, 'grad_norm': 3.8678996562957764, 'learning_rate': 2.8081442660546125e-05, 'epoch': 0.07, 'num_input_tokens_seen': 103728}
{'loss': 1.2969, 'grad_norm': 0.6766985654830933, 'learning_rate': 1.9216926233717085e-05, 'epoch': 0.07, 'num_input_tokens_seen': 112160}
{'loss': 1.5026, 'grad_norm': 0.7236246466636658, 'learning_rate': 1.1697777844051105e-05, 'epoch': 0.08, 'num_input_tokens_seen': 117984}
{'loss': 1.3583, 'grad_norm': 0.8583828806877136, 'learning_rate': 5.852620357053651e-06, 'epoch': 0.09, 'num_input_tokens_seen': 126624}
{'train_runtime': 174.5036, 'train_samples_per_second': 4.584, 'train_steps_per_second': 0.287, 'train_tokens_per_second': 1136.939, 'train_loss': 1.4425424814224244, 'epoch': 0.09, 'num_input_tokens_seen': 132048}
***** train metrics *****
epoch = 0.0889
num_input_tokens_seen = 132048
total_flos = 5251377GF
train_loss = 1.4425
train_runtime = 0:02:54.50
train_samples_per_second = 4.584
train_steps_per_second = 0.287
train_tokens_per_second = 1136.939
Figure saved at: ./results/lora_sft_Qwen-7B_1_gpu_50_step_20240918162245/training_loss.png
09/18/2024 16:29:17 - WARNING - llamafactory.extras.ploting - No metric eval_loss to plot.
09/18/2024 16:29:17 - WARNING - llamafactory.extras.ploting - No metric eval_accuracy to plot.
***** eval metrics *****
epoch = 0.0889
eval_loss = 1.3353
eval_runtime = 0:00:54.39
eval_samples_per_second = 18.386
eval_steps_per_second = 9.193
num_input_tokens_seen = 132048
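The label_ids dump above shows how the supervised targets are built: every prompt token (the system turn plus the user turn, which in this Chinese sample asks whether a given article opening is grammatical) is masked with -100, so only the 8 tokens of the assistant reply contribute to the loss. The following is a schematic reconstruction of that masking, not LLaMA-Factory's own code.

# Schematic illustration of the prompt masking visible in the log above (not the library's code).
IGNORE_INDEX = -100

def build_labels(input_ids, prompt_len):
    """Mask the prompt portion so cross-entropy is computed on the response tokens only."""
    return [IGNORE_INDEX] * prompt_len + input_ids[prompt_len:]

input_ids = list(range(70))                       # stand-in for the 70-token example shown above
labels = build_labels(input_ids, prompt_len=62)   # 62 masked prompt tokens, as in the dump
assert labels[:62] == [IGNORE_INDEX] * 62
assert labels[62:] == input_ids[62:]              # the 8 assistant-reply tokens keep their ids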
@@ -0,0 +1,31 @@
cutoff_len: 1024
dataset: belle_1m
ddp_timeout: 180000000
do_train: true
eval_steps: 500
eval_strategy: steps
finetuning_type: lora
fp16: true
gradient_accumulation_steps: 8
include_num_input_tokens_seen: true
include_tokens_per_second: true
learning_rate: 0.0001
logging_steps: 3
lora_target: all
lr_scheduler_type: cosine
max_samples: 10000
max_steps: 50
model_name_or_path: ../../../models/qwen
num_train_epochs: 10.0
output_dir: ./results/lora_sft_Qwen-7B_1_gpu_50_step_20240918162245
overwrite_cache: true
overwrite_output_dir: true
per_device_eval_batch_size: 2
per_device_train_batch_size: 2
plot_loss: true
preprocessing_num_workers: 16
save_steps: 500
stage: sft
template: qwen
val_size: 0.1
warmup_ratio: 0.1
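Some of the evaluation numbers earlier in the diff can be cross-checked against this configuration: with max_samples: 10000 and val_size: 0.1, the held-out split should be roughly 1,000 examples, which is what the reported eval throughput implies. A rough, illustrative check:

max_samples = 10000
val_size = 0.1

eval_examples = int(max_samples * val_size)   # ~1000 held-out examples
train_examples = max_samples - eval_examples  # ~9000 available for training

# From eval_results.json: 18.386 samples/s over a 54.39 s evaluation run.
print(round(18.386 * 54.3905))                # ~1000, consistent with the split above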
File diff suppressed because it is too large
@@ -0,0 +1,10 @@
{
  "epoch": 0.08888888888888889,
  "num_input_tokens_seen": 132048,
  "total_flos": 5638623387844608.0,
  "train_loss": 1.4425424814224244,
  "train_runtime": 174.5036,
  "train_samples_per_second": 4.584,
  "train_steps_per_second": 0.287,
  "train_tokens_per_second": 1136.939
}
@@ -0,0 +1,17 @@
{"current_steps": 3, "total_steps": 50, "loss": 1.5189, "learning_rate": 4e-05, "epoch": 0.005333333333333333, "percentage": 6.0, "cur_time": "2024-09-18 16:26:34", "elapsed_time": "0:00:12", "remaining_time": "0:03:08", "throughput": 815.19, "total_tokens": 9808}
{"current_steps": 6, "total_steps": 50, "loss": 1.5504, "learning_rate": 6e-05, "epoch": 0.010666666666666666, "percentage": 12.0, "cur_time": "2024-09-18 16:26:44", "elapsed_time": "0:00:22", "remaining_time": "0:02:43", "throughput": 864.66, "total_tokens": 19312}
{"current_steps": 9, "total_steps": 50, "loss": 1.5661, "learning_rate": 9.987820251299122e-05, "epoch": 0.016, "percentage": 18.0, "cur_time": "2024-09-18 16:26:54", "elapsed_time": "0:00:32", "remaining_time": "0:02:28", "throughput": 896.34, "total_tokens": 29232}
{"current_steps": 12, "total_steps": 50, "loss": 1.7033, "learning_rate": 9.806308479691595e-05, "epoch": 0.021333333333333333, "percentage": 24.0, "cur_time": "2024-09-18 16:27:04", "elapsed_time": "0:00:42", "remaining_time": "0:02:15", "throughput": 889.29, "total_tokens": 37984}
{"current_steps": 15, "total_steps": 50, "loss": 1.4225, "learning_rate": 9.567727288213005e-05, "epoch": 0.02666666666666667, "percentage": 30.0, "cur_time": "2024-09-18 16:27:15", "elapsed_time": "0:00:52", "remaining_time": "0:02:03", "throughput": 844.36, "total_tokens": 44592}
{"current_steps": 18, "total_steps": 50, "loss": 1.4217, "learning_rate": 9.24024048078213e-05, "epoch": 0.032, "percentage": 36.0, "cur_time": "2024-09-18 16:27:25", "elapsed_time": "0:01:03", "remaining_time": "0:01:52", "throughput": 828.43, "total_tokens": 52400}
{"current_steps": 21, "total_steps": 50, "loss": 1.2793, "learning_rate": 8.596699001693255e-05, "epoch": 0.037333333333333336, "percentage": 42.0, "cur_time": "2024-09-18 16:27:35", "elapsed_time": "0:01:13", "remaining_time": "0:01:41", "throughput": 822.49, "total_tokens": 60320}
{"current_steps": 24, "total_steps": 50, "loss": 1.4875, "learning_rate": 7.795964517353735e-05, "epoch": 0.042666666666666665, "percentage": 48.0, "cur_time": "2024-09-18 16:27:45", "elapsed_time": "0:01:23", "remaining_time": "0:01:30", "throughput": 802.33, "total_tokens": 67024}
{"current_steps": 27, "total_steps": 50, "loss": 1.2446, "learning_rate": 6.873032967079561e-05, "epoch": 0.048, "percentage": 54.0, "cur_time": "2024-09-18 16:27:55", "elapsed_time": "0:01:33", "remaining_time": "0:01:19", "throughput": 787.52, "total_tokens": 73776}
{"current_steps": 30, "total_steps": 50, "loss": 1.4691, "learning_rate": 5.868240888334653e-05, "epoch": 0.05333333333333334, "percentage": 60.0, "cur_time": "2024-09-18 16:28:06", "elapsed_time": "0:01:44", "remaining_time": "0:01:09", "throughput": 792.62, "total_tokens": 82592}
{"current_steps": 33, "total_steps": 50, "loss": 1.4451, "learning_rate": 4.825502516487497e-05, "epoch": 0.058666666666666666, "percentage": 66.0, "cur_time": "2024-09-18 16:28:16", "elapsed_time": "0:01:54", "remaining_time": "0:00:59", "throughput": 789.55, "total_tokens": 90512}
{"current_steps": 36, "total_steps": 50, "loss": 1.4139, "learning_rate": 3.790390522001662e-05, "epoch": 0.064, "percentage": 72.0, "cur_time": "2024-09-18 16:28:27", "elapsed_time": "0:02:05", "remaining_time": "0:00:48", "throughput": 773.29, "total_tokens": 96848}
{"current_steps": 39, "total_steps": 50, "loss": 1.3205, "learning_rate": 2.8081442660546125e-05, "epoch": 0.06933333333333333, "percentage": 78.0, "cur_time": "2024-09-18 16:28:37", "elapsed_time": "0:02:15", "remaining_time": "0:00:38", "throughput": 765.37, "total_tokens": 103728}
{"current_steps": 42, "total_steps": 50, "loss": 1.2969, "learning_rate": 1.9216926233717085e-05, "epoch": 0.07466666666666667, "percentage": 84.0, "cur_time": "2024-09-18 16:28:48", "elapsed_time": "0:02:25", "remaining_time": "0:00:27", "throughput": 769.02, "total_tokens": 112160}
{"current_steps": 45, "total_steps": 50, "loss": 1.5026, "learning_rate": 1.1697777844051105e-05, "epoch": 0.08, "percentage": 90.0, "cur_time": "2024-09-18 16:28:58", "elapsed_time": "0:02:35", "remaining_time": "0:00:17", "throughput": 757.01, "total_tokens": 117984}
{"current_steps": 48, "total_steps": 50, "loss": 1.3583, "learning_rate": 5.852620357053651e-06, "epoch": 0.08533333333333333, "percentage": 96.0, "cur_time": "2024-09-18 16:29:08", "elapsed_time": "0:02:46", "remaining_time": "0:00:06", "throughput": 761.68, "total_tokens": 126624}
{"current_steps": 50, "total_steps": 50, "epoch": 0.08888888888888889, "percentage": 100.0, "cur_time": "2024-09-18 16:29:16", "elapsed_time": "0:02:54", "remaining_time": "0:00:00", "throughput": 756.73, "total_tokens": 132048}
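Each line above is a self-contained JSON record, so the loss curve that the run saves as training_loss.png can be reproduced directly from this file; a minimal sketch, assuming the log is stored locally as trainer_log.jsonl:

import json

import matplotlib.pyplot as plt

# Hypothetical local filename for the log shown above.
steps, losses = [], []
with open("trainer_log.jsonl", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        if "loss" in record:                 # the final record (step 50) carries no loss field
            steps.append(record["current_steps"])
            losses.append(record["loss"])

plt.plot(steps, losses, marker="o")
plt.xlabel("step")
plt.ylabel("training loss")
plt.savefig("training_loss.png")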
@@ -0,0 +1,172 @@
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.08888888888888889,
  "eval_steps": 500,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.005333333333333333, "grad_norm": 0.8999722599983215, "learning_rate": 4e-05, "loss": 1.5189, "num_input_tokens_seen": 9808, "step": 3},
    {"epoch": 0.010666666666666666, "grad_norm": NaN, "learning_rate": 6e-05, "loss": 1.5504, "num_input_tokens_seen": 19312, "step": 6},
    {"epoch": 0.016, "grad_norm": 0.9268227219581604, "learning_rate": 9.987820251299122e-05, "loss": 1.5661, "num_input_tokens_seen": 29232, "step": 9},
    {"epoch": 0.021333333333333333, "grad_norm": 1.1588999032974243, "learning_rate": 9.806308479691595e-05, "loss": 1.7033, "num_input_tokens_seen": 37984, "step": 12},
    {"epoch": 0.02666666666666667, "grad_norm": 1.0571134090423584, "learning_rate": 9.567727288213005e-05, "loss": 1.4225, "num_input_tokens_seen": 44592, "step": 15},
    {"epoch": 0.032, "grad_norm": 1.720107913017273, "learning_rate": 9.24024048078213e-05, "loss": 1.4217, "num_input_tokens_seen": 52400, "step": 18},
    {"epoch": 0.037333333333333336, "grad_norm": 0.930574893951416, "learning_rate": 8.596699001693255e-05, "loss": 1.2793, "num_input_tokens_seen": 60320, "step": 21},
    {"epoch": 0.042666666666666665, "grad_norm": 1.6979925632476807, "learning_rate": 7.795964517353735e-05, "loss": 1.4875, "num_input_tokens_seen": 67024, "step": 24},
    {"epoch": 0.048, "grad_norm": 2.2298834323883057, "learning_rate": 6.873032967079561e-05, "loss": 1.2446, "num_input_tokens_seen": 73776, "step": 27},
    {"epoch": 0.05333333333333334, "grad_norm": 1.6609553098678589, "learning_rate": 5.868240888334653e-05, "loss": 1.4691, "num_input_tokens_seen": 82592, "step": 30},
    {"epoch": 0.058666666666666666, "grad_norm": 1.1659108400344849, "learning_rate": 4.825502516487497e-05, "loss": 1.4451, "num_input_tokens_seen": 90512, "step": 33},
    {"epoch": 0.064, "grad_norm": 1.2636826038360596, "learning_rate": 3.790390522001662e-05, "loss": 1.4139, "num_input_tokens_seen": 96848, "step": 36},
    {"epoch": 0.06933333333333333, "grad_norm": 3.8678996562957764, "learning_rate": 2.8081442660546125e-05, "loss": 1.3205, "num_input_tokens_seen": 103728, "step": 39},
    {"epoch": 0.07466666666666667, "grad_norm": 0.6766985654830933, "learning_rate": 1.9216926233717085e-05, "loss": 1.2969, "num_input_tokens_seen": 112160, "step": 42},
    {"epoch": 0.08, "grad_norm": 0.7236246466636658, "learning_rate": 1.1697777844051105e-05, "loss": 1.5026, "num_input_tokens_seen": 117984, "step": 45},
    {"epoch": 0.08533333333333333, "grad_norm": 0.8583828806877136, "learning_rate": 5.852620357053651e-06, "loss": 1.3583, "num_input_tokens_seen": 126624, "step": 48},
    {"epoch": 0.08888888888888889, "num_input_tokens_seen": 132048, "step": 50, "total_flos": 5638623387844608.0, "train_loss": 1.4425424814224244, "train_runtime": 174.5036, "train_samples_per_second": 4.584, "train_steps_per_second": 0.287, "train_tokens_per_second": 1136.939}
  ],
  "logging_steps": 3,
  "max_steps": 50,
  "num_input_tokens_seen": 132048,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5638623387844608.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}
Binary file not shown.
Binary file not shown.
Image file added (44 KiB); preview not shown.