diff --git a/tests/auto_gptq.py b/tests/auto_gptq.py
new file mode 100644
index 00000000..3fd2ab12
--- /dev/null
+++ b/tests/auto_gptq.py
@@ -0,0 +1,47 @@
+# coding=utf-8
+# Quantizes fine-tuned models with AutoGPTQ (https://github.com/PanQiWei/AutoGPTQ).
+# Usage: python auto_gptq.py --input_dir path_to_llama_model --output_dir path_to_quant_model --data_file alpaca.json
+# dataset format: instruction (string), output (string), history (List[List[str]], optional)
+
+
+import fire
+from datasets import load_dataset
+from transformers import AutoTokenizer
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
+
+
+def quantize(input_dir: str, output_dir: str, data_file: str):
+    tokenizer = AutoTokenizer.from_pretrained(input_dir, use_fast=False, padding_side="left")
+
+    def format_example(examples):
+        prefix = ("A chat between a curious user and an artificial intelligence assistant. "
+                  "The assistant gives helpful, detailed, and polite answers to the user's questions.")
+        texts = []
+        for i in range(len(examples["instruction"])):
+            prompt = prefix + "\n"
+            if "history" in examples and examples["history"][i]:
+                for user_query, bot_resp in examples["history"][i]:
+                    prompt += "Human: {}\nAssistant: {}\n".format(user_query, bot_resp)
+            prompt += "Human: {}\nAssistant: {}".format(examples["instruction"][i], examples["output"][i])
+            texts.append(prompt)
+        return tokenizer(texts, truncation=True, max_length=1024)
+
+    dataset = load_dataset("json", data_files=data_file)["train"]
+    column_names = list(dataset.column_names)
+    dataset = dataset.select(range(1024))
+    dataset = dataset.map(format_example, batched=True, remove_columns=column_names)
+    dataset = dataset.shuffle()
+
+    quantize_config = BaseQuantizeConfig(
+        bits=4,
+        group_size=128,
+        desc_act=False
+    )
+
+    model = AutoGPTQForCausalLM.from_pretrained(input_dir, quantize_config)
+    model.quantize(dataset)
+    model.save_quantized(output_dir)
+
+
+if __name__ == "__main__":
+    fire.Fire(quantize)
diff --git a/evaluation/evaluate_zh.py b/tests/evaluate_zh.py
similarity index 93%
rename from evaluation/evaluate_zh.py
rename to tests/evaluate_zh.py
index e2bc67c2..b079cf7d 100644
--- a/evaluation/evaluate_zh.py
+++ b/tests/evaluate_zh.py
@@ -1,6 +1,7 @@
 # coding=utf-8
 # Evaluates fine-tuned models automatically.
-# Usage: python evaluate.py --evalset ceval/ceval-exam:law --split dev --api_base http://localhost:8000/v1 --task_type choice
+# Usage: python evaluate_zh.py --evalset ceval/ceval-exam:law --split dev --output_file result.json
+#        --api_base http://localhost:8000/v1 --task_type choice --n_samples 100
 # dataset format: question (string), A (string), B (string), C (string), D (string), answer (Literal["A", "B", "C", "D"])
 
 
@@ -75,6 +76,7 @@ EXT2TYPE = {
 def evaluate(
     evalset: str,
     api_base: str,
+    output_file: str,
     split: Optional[str] = "val",
     task_type: Optional[Literal["choice", "cloze", "openqa"]] = "choice",
     n_samples: Optional[int] = 20
@@ -122,7 +124,8 @@ def evaluate(
         })
 
     print("Result: {}/{}\nAccuracy: {:.2f}%".format(n_correct, n_samples, n_correct / n_samples * 100))
-    with open("result.json", "w", encoding="utf-8") as f:
+
+    with open(output_file, "w", encoding="utf-8") as f:
         json.dump(predictions, f, indent=2, ensure_ascii=False)
 
 
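
Not part of the patch: a minimal sketch of how the 4-bit checkpoint written by tests/auto_gptq.py could be loaded back for a quick sanity check, assuming AutoGPTQ's from_quantized API. The paths path_to_llama_model and path_to_quant_model are the same placeholder paths used in the script's usage comment, and the tokenizer is loaded from the original model directory because save_quantized() writes only the quantized weights and config.

# Illustrative only -- not included in this diff.
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

input_dir = "path_to_llama_model"   # original fine-tuned model (tokenizer source)
output_dir = "path_to_quant_model"  # directory written by auto_gptq.py

tokenizer = AutoTokenizer.from_pretrained(input_dir, use_fast=False)
model = AutoGPTQForCausalLM.from_quantized(output_dir, device="cuda:0")

# Generate a short reply in the same "Human:/Assistant:" format used for calibration.
inputs = tokenizer("Human: Hello, who are you?\nAssistant:", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))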