"""Split an SRT-style transcript into fixed-size, prompt-prefixed chunks."""

import os
import re

# A whole line that is nothing but a cue counter ("12") or nothing but a cue
# time range ("00:00:01,000 --> 00:00:04,000").
_CUE_MARKER = re.compile(r"\d+|\d+:\d+:\d+,\d+ --> \d+:\d+:\d+,\d+")


def process_text(file_name: str, char_length: int, prompt: str) -> str:
    """Strip cue markers from a transcript and write prompt-prefixed chunks.

    Every line of ``file_name`` is stripped of surrounding whitespace. Lines
    that consist solely of a number or solely of a
    ``H:M:S,ms --> H:M:S,ms`` time range are dropped. The remaining lines are
    concatenated without a separator and cut into pieces of at most
    ``char_length`` characters. Each piece is written as
    ``"<prompt>: <piece>"`` followed by three CRLF line endings.

    Args:
        file_name: Path of the UTF-8 transcript to read.
        char_length: Maximum number of characters per piece; must be positive.
        prompt: Text placed in front of every piece.

    Returns:
        The path of the file that was written: ``processed_<name>`` located
        next to the input file.

    Raises:
        ValueError: If ``char_length`` is not a positive number.
        OSError: If the input cannot be read or the output cannot be written.
    """
    if char_length <= 0:
        raise ValueError(f"char_length must be positive, got {char_length!r}")

    kept = []
    # "utf-8-sig" also accepts plain UTF-8, but removes a leading BOM that
    # would otherwise hide the first cue counter from the pattern.
    with open(file_name, 'r', encoding='utf-8-sig') as source:
        for raw_line in source:
            line = raw_line.strip()
            # fullmatch, not match: a spoken line that merely starts with a
            # digit must be kept.
            if not _CUE_MARKER.fullmatch(line):
                kept.append(line)

    text = "".join(kept)
    segments = [
        text[start:start + char_length]
        for start in range(0, len(text), char_length)
    ]

    # Prefix the base name only, so an input that lives in a sub-directory
    # gets its output written beside it.
    directory, base_name = os.path.split(file_name)
    output_path = os.path.join(directory, 'processed_' + base_name)

    # newline='' writes the explicit "\r\n" sequences verbatim instead of
    # letting the platform translate "\n" a second time.
    with open(output_path, 'w', encoding='utf-8', newline='') as target:
        target.writelines(
            f"{prompt}: {segment}\r\n\r\n\r\n" for segment in segments
        )

    return output_path


# Script usage
prompt = "整理后面我发给你的培训录音,去掉语气词,将其改写为适合书面发表的语言,并适当增加小标题,用markdown的格式输出"

if __name__ == "__main__":
    process_text('Section1.txt', 4000, prompt)