# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import datasets import pandas as pd _CITATION = """\ @article{li2023cmmlu, title={CMMLU: Measuring massive multitask language understanding in Chinese}, author={Haonan Li and Yixuan Zhang and Fajri Koto and Yifei Yang and Hai Zhao and Yeyun Gong and Nan Duan and Timothy Baldwin}, journal={arXiv preprint arXiv:2306.09212}, year={2023} } """ _DESCRIPTION = """\ CMMLU is a comprehensive Chinese assessment suite specifically designed to evaluate the advanced knowledge and reasoning abilities of LLMs within the Chinese language and cultural context. """ _HOMEPAGE = "https://github.com/haonan-li/CMMLU" _LICENSE = "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License" _URL = "cmmlu.zip" task_list = [ 'agronomy', 'anatomy', 'ancient_chinese', 'arts', 'astronomy', 'business_ethics', 'chinese_civil_service_exam', 'chinese_driving_rule', 'chinese_food_culture', 'chinese_foreign_policy', 'chinese_history', 'chinese_literature', 'chinese_teacher_qualification', 'clinical_knowledge', 'college_actuarial_science', 'college_education', 'college_engineering_hydrology', 'college_law', 'college_mathematics', 'college_medical_statistics', 'college_medicine', 'computer_science', 'computer_security', 'conceptual_physics', 'construction_project_management', 'economics', 'education', 'electrical_engineering', 'elementary_chinese', 'elementary_commonsense', 'elementary_information_and_technology', 'elementary_mathematics', 'ethnology', 'food_science', 'genetics', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_geography', 'high_school_mathematics', 'high_school_physics', 'high_school_politics', 'human_sexuality', 'international_law', 'journalism', 'jurisprudence', 'legal_and_moral_basis', 'logical', 'machine_learning', 'management', 'marketing', 'marxist_theory', 'modern_chinese', 'nutrition', 'philosophy', 'professional_accounting', 'professional_law', 'professional_medicine', 'professional_psychology', 'public_relations', 'security_study', 'sociology', 'sports_science', 'traditional_chinese_medicine', 'virology', 'world_history', 'world_religions', ] class CMMLUConfig(datasets.BuilderConfig): def __init__(self, **kwargs): super().__init__(version=datasets.Version("1.0.1"), **kwargs) class CMMLU(datasets.GeneratorBasedBuilder): BUILDER_CONFIGS = [ CMMLUConfig( name=task_name, ) for task_name in task_list ] def _info(self): features = datasets.Features( { "question": datasets.Value("string"), "A": datasets.Value("string"), "B": datasets.Value("string"), "C": datasets.Value("string"), "D": datasets.Value("string"), "answer": datasets.Value("string"), } ) return datasets.DatasetInfo( description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION, ) def _split_generators(self, dl_manager): data_dir = dl_manager.download_and_extract(_URL) task_name = self.config.name return [ datasets.SplitGenerator( name=datasets.Split.TEST, gen_kwargs={ "filepath": os.path.join(data_dir, f"test/{task_name}.csv"), }, ), datasets.SplitGenerator( name=datasets.Split.TRAIN, gen_kwargs={ "filepath": os.path.join(data_dir, f"dev/{task_name}.csv"), }, ), ] def _generate_examples(self, filepath): df = pd.read_csv(filepath, header=0, index_col=0, encoding="utf-8") for i, instance in enumerate(df.to_dict(orient="records")): question = instance.pop("Question", "") answer = instance.pop("Answer", "") instance["question"] = question instance["answer"] = answer yield i, instance