KAG/kag/examples/csqa/generate_data.py

49 lines
1.6 KiB
Python

def to_snake_case(name):
    """Convert *name* into a lowercase, underscore-separated identifier.

    The tokens ``SQL`` and ``VBA`` are kept whole. Any other word is a
    single ASCII letter followed by a run of lowercase letters and digits,
    so an uppercase letter always starts a new word. Characters that do not
    belong to a word (spaces, punctuation, leading digits, ...) are dropped.
    Returns an empty string when *name* contains no ASCII letters.
    """
    import re

    token_pattern = re.compile("SQL|VBA|[A-Za-z][a-z0-9]*")
    return "_".join(
        match.group(0).lower() for match in token_pattern.finditer(name)
    )
def main():
    """Split ``cs.jsonl`` into builder documents and a solver question file.

    Reads ``cs.jsonl`` from the directory containing this script, one JSON
    object per line. For each distinct document (title plus context) a text
    file named after the snake-cased title is written to ``builder/data``.
    Every question is then stored, with its ``context`` replaced by the
    title, in ``solver/data/questions.json``.

    If ``cs.jsonl`` is missing, a download hint is printed and the function
    returns without writing anything.
    """
    import io
    import os
    import json

    dir_path = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(dir_path, "cs.jsonl")
    if not os.path.isfile(file_path):
        print(
            "Please download cs.jsonl from https://huggingface.co/datasets/TommyChien/UltraDomain/tree/main"
        )
        return

    builder_data_dir = os.path.join(dir_path, "builder", "data")
    solver_data_dir = os.path.join(dir_path, "solver", "data")
    # Create the output directories up front so the writes below cannot fail
    # merely because the directory tree does not exist yet.
    os.makedirs(builder_data_dir, exist_ok=True)
    os.makedirs(solver_data_dir, exist_ok=True)

    questions = []
    documents = set()
    with io.open(file_path, "r", encoding="utf-8", newline="\n") as fin:
        for line in fin:
            if not line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            item = json.loads(line)
            title = item["meta"]["title"]
            content = title + "\n" + item["context"]
            if content not in documents:
                # Record the document so it is written only once; many
                # questions share the same context.
                documents.add(content)
                # NOTE: the file name depends on the title only, so two
                # different documents with the same snake-cased title
                # overwrite each other.
                name = to_snake_case(title)
                output_file_path = os.path.join(builder_data_dir, name + ".txt")
                with io.open(
                    output_file_path, "w", encoding="utf-8", newline="\n"
                ) as fout:
                    print(content, file=fout)
            # The full context now lives in the builder document; keep only
            # the title in the question record.
            item["context"] = title
            questions.append(item)

    output_file_path = os.path.join(solver_data_dir, "questions.json")
    with io.open(output_file_path, "w", encoding="utf-8", newline="\n") as fout:
        json.dump(
            questions, fout, separators=(",", ": "), indent=4, ensure_ascii=False
        )
        # Terminate the file with a newline.
        print(file=fout)
# Run the data generation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()