CPM-9G-8B/FM_9G/fm9g/dataset/tools/jsonl_to_index.py

32 lines
820 B
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright @2023 AI, ZHIHU Inc. (zhihu.com)
#
# @author: ouzebin <ouzebin@zhihu.com>
# @date: 2023/08/07
import argparse
import os
def build_index(path):
data_path = os.path.join(path, "data.jsonl")
assert os.path.exists(data_path), f"Jsonline dataset '{data_path}' not found."
offset = 0
starts = [offset]
with open(data_path, "rb") as fin:
for line in fin:
offset += len(line)
starts.append(offset)
with open(os.path.join(path, "index"), "w") as fout:
for s in starts:
fout.write(f"{s}\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--path", "-p", required=True, help="Data path.")
args = parser.parse_args()
build_index(args.path)