forked from jiuyuan/CPM-9G-8B
25 lines
666 B
Python
25 lines
666 B
Python
|
import argparse
|
||
|
import os
|
||
|
|
||
|
|
||
|
def build_index(path: str) -> None:
    """Write a byte-offset index for the JSON Lines dataset under *path*.

    Reads ``<path>/data.jsonl`` in binary mode and writes ``<path>/index``,
    a text file holding one decimal integer per line: the byte offset at
    which each line of the dataset starts, followed by a final entry equal
    to the total number of bytes read. An empty dataset therefore yields
    the single entry ``0``.

    Args:
        path: Directory containing ``data.jsonl``. ``index`` is created (or
            overwritten) in the same directory.

    Raises:
        AssertionError: If ``<path>/data.jsonl`` does not exist.
    """
    data_path = os.path.join(path, "data.jsonl")
    # Raised explicitly rather than through an ``assert`` statement so the
    # check is not stripped under ``python -O``. The exception type and the
    # message are kept as they were for existing callers.
    if not os.path.exists(data_path):
        raise AssertionError(f"Jsonline dataset '{data_path}' not found.")

    offset = 0
    # Binary mode: offsets must count raw bytes, not decoded characters.
    # The dataset is opened before the index so that a failure to open it
    # never creates or truncates an index file.
    with open(data_path, "rb") as fin:
        with open(os.path.join(path, "index"), "w") as fout:
            # Offsets are streamed straight to disk instead of being
            # collected in a list, so memory use does not grow with the
            # number of lines in the dataset.
            fout.write(f"{offset}\n")
            for line in fin:
                offset += len(line)
                fout.write(f"{offset}\n")
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # Script entry point: take the dataset directory from the command line
    # and build its index file.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--path",
        "-p",
        required=True,
        help="Data path.",
    )
    options = cli.parse_args()
    build_index(options.path)
|