Andley_OGNTa/Scripts/Step1-Trim OGNTa-TC.py

123 lines
2.5 KiB
Python
Raw Normal View History

2021-10-22 01:57:19 +00:00
import re
2022-04-12 09:11:37 +00:00
# ══════ trim Chinese glosses ═══════════════════════
inputFile = "./source/OpenGNT_interlinear_CUVtc.csv"
outputFile = "./CUV-Glosses.txt"

# Ordered (regex pattern, replacement) pairs applied one after another to the
# whole input text.  Order matters: each rule sees the output of the previous
# one.  Patterns that contain regex escapes are raw strings so that sequences
# such as \d, \[, \( and \* are not parsed as invalid Python string escapes.
#
# NOTE(review): several entries look damaged and should be checked against
# the data they were written for:
#   * the ('', '') pairs and the tab/newline identity pairs replace text with
#     itself, i.e. they change nothing;
#   * '.*?' can match the empty string; on Python 3.7+ re.sub also replaces
#     the non-empty matches that follow each empty one, so this rule
#     presumably erases every character except newlines -- TODO confirm;
#   * the bare r'\d+' rules delete every run of digits, so the later
#     r'\d+\t\n' rule has no digits left to match;
#   * r'\(.*?\)' runs before the r'\(\*bcc.*?\)' and r'\(=bcc.*?\)' rules and
#     already removes anything those two could match.
searchReplace = (
    (r'\d+', ''),
    (r'\d+', ''),
    (r'\d+', ''),
    (r'\d+〉', ''),
    ('', ''),
    ('', ''),
    ('', ''),
    ('', ''),
    (',', ''),
    ('.*?', ''),
    ('【.*?】', ''),
    (r'\[.*?\]', ''),
    ('⸃⸂', ''),
    (r'\(bcc.*?\)', ''),
    (r'\(.*?\)', ''),
    (r'\(\*bcc.*?\)', ''),
    (r'\(=bcc.*?\)', ''),
    (r'\*bcc.*?', ''),
    (' ', ''),
    (r'\*', ''),
    ('\t', '\t'),
    ('\n', '\n'),
    (r'\d+\t\n', ''),
    (' ', ''),
)


def apply_search_replace(text, rules):
    """Return *text* after applying every rule in *rules*, in order.

    Args:
        text: The string to transform.
        rules: Iterable of ``(pattern, replacement)`` pairs; each pair is
            handed to ``re.sub`` and the result feeds the next pair.

    Returns:
        The transformed string (*text* itself when *rules* is empty).
    """
    for search, replace in rules:
        text = re.sub(search, replace, text)
    return text


def trim_glosses(source_path, target_path, rules):
    """Read *source_path*, apply *rules*, and write the result to *target_path*.

    The input is decoded as UTF-8 and the output is written as ``utf_8_sig``
    (UTF-8 with a leading byte-order mark), matching the encoding the later
    steps of this script use when reading the file back.
    """
    with open(source_path, 'r', encoding="utf-8") as source:
        trimmed = apply_search_replace(source.read(), rules)
    with open(target_path, 'w', encoding="utf_8_sig") as target:
        target.write(trimmed)


# Only touch the filesystem when executed as a script, so the helpers above
# can be imported and exercised without the data files being present.
if __name__ == "__main__":
    trim_glosses(inputFile, outputFile, searchReplace)
# ══════ merge Chinese glosses into OGNTa ═══════════
glosses = "./CUV-Glosses.txt"
inputFile = "./OGNTa.txt"
outputFile = "./OGNTa-TC.txt"


def load_glosses(gloss_lines):
    """Build the key -> gloss dictionary from an iterable of text lines.

    Each usable line holds a key, whitespace, and a gloss.  The key is the
    first whitespace-delimited token; the gloss is the rest of the line with
    surrounding whitespace removed.  Lines that do not provide both parts
    (blank lines, key-only lines) are skipped instead of raising, so the
    affected keys simply fall back to '-' when merged.  When a key occurs
    more than once, the last occurrence wins.

    Args:
        gloss_lines: Iterable of strings (for example an open text file).

    Returns:
        dict mapping key strings to gloss strings.
    """
    gloss_dict = {}
    for line in gloss_lines:
        parts = line.strip().split(None, 1)
        if len(parts) != 2:
            continue
        key, value = parts
        gloss_dict[key] = value
    return gloss_dict


def merge_gloss(line, gloss_dict):
    """Return *line* rebuilt as five columns plus the gloss looked up by column 0.

    The line is split on tabs.  The first five columns are kept (missing ones
    are padded with empty strings so short rows no longer raise IndexError),
    any further columns are dropped, and a sixth column is appended holding
    ``gloss_dict[column 0]``, or '-' when there is no non-empty gloss for
    that key.  A line with no content at all is passed through unchanged.

    Args:
        line: One tab-separated input line, with or without trailing newline.
        gloss_dict: Mapping from the value in column 0 to its gloss.

    Returns:
        The rebuilt line, terminated by a newline.
    """
    columns = line.rstrip("\n").split("\t")
    if columns == [""]:
        return line
    kept = (columns + [""] * 5)[:5]
    gloss = gloss_dict.get(kept[0]) or '-'
    return "\t".join(kept + [str(gloss)]) + "\n"


def merge_glosses(gloss_path, source_path, target_path):
    """Write *source_path* to *target_path* with the gloss column filled in.

    All three files are handled as ``utf_8_sig`` text.  The source is read
    completely before the target is opened for writing.
    """
    with open(gloss_path, 'r', encoding="utf_8_sig") as gloss_file:
        gloss_dict = load_glosses(gloss_file)
    with open(source_path, 'r', encoding="utf_8_sig") as source:
        Lines = source.readlines()
    with open(target_path, 'w', encoding='utf_8_sig') as target:
        target.writelines(merge_gloss(ol, gloss_dict) for ol in Lines)


# Only touch the filesystem when executed as a script, so the helpers above
# can be imported and exercised without the data files being present.
if __name__ == "__main__":
    merge_glosses(glosses, inputFile, outputFile)
# ---------------------------------------------------
# Update Book Name Abbreviation
#
# Ordered (pattern, replacement) pairs.  Every pattern is an English book
# abbreviation followed by one space; matching is case-sensitive and applies
# anywhere in the text, not only at the start of a line or in one column.
# NOTE(review): entries whose replacement is '' delete the abbreviation and
# its trailing space outright, whereas the others substitute a Chinese
# abbreviation followed by a space -- confirm the empty replacements are
# intended and not lost text.
searchReplace = (
    ('Mat ', ''),
    ('Mar ', ''),
    ('Luk ', ''),
    ('Joh ', ''),
    ('Act ', ''),
    ('Rom ', ''),
    ('1Co ', '林前 '),
    ('2Co ', '林後 '),
    ('Gal ', ''),
    ('Eph ', ''),
    ('Php ', ''),
    ('Col ', '西 '),
    ('1Th ', '帖前 '),
    ('2Th ', '帖後 '),
    ('1Ti ', '提前 '),
    ('2Ti ', '提後 '),
    ('Tit ', ''),
    ('Phm ', ''),
    ('Heb ', ''),
    ('Jas ', ''),
    ('1Pe ', '彼前 '),
    ('2Pe ', '彼後 '),
    ('1Jo ', '約一 '),
    ('2Jo ', '約二 '),
    ('3Jo ', '約三 '),
    ('Jud ', ''),
    ('Rev ', ''),
)


def replace_book_names(text, rules):
    """Return *text* with every rule in *rules* applied in order.

    Args:
        text: The string to transform.
        rules: Iterable of ``(pattern, replacement)`` pairs handed to
            ``re.sub`` one after another.

    Returns:
        The transformed string (*text* itself when *rules* is empty).
    """
    for search, replace in rules:
        text = re.sub(search, replace, text)
    return text


def update_book_names(path, rules):
    """Rewrite the ``utf_8_sig`` text file at *path* in place using *rules*.

    The file is read completely and closed before it is reopened for
    writing, so the content is never read from a truncated file.
    """
    with open(path, 'r', encoding="utf_8_sig") as source:
        newData = source.read()
    newData = replace_book_names(newData, rules)
    with open(path, 'w', encoding='utf_8_sig') as target:
        target.write(newData)


# Only touch the filesystem when executed as a script.  ``outputFile`` is the
# path assigned earlier in this script for the merged OGNTa output.
if __name__ == "__main__":
    update_book_names(outputFile, searchReplace)