2021-10-22 01:57:19 +00:00
|
|
|
|
import re
|
|
|
|
|
|
2022-04-12 09:11:37 +00:00
|
|
|
|
# ══════ trim Chinese glosses ═══════════════════════
#
# Stage 1: read the interlinear CSV, strip markup and punctuation from it with
# the ordered regex rules below, and write the result to CUV-Glosses.txt.

inputFile = "./source/OpenGNT_interlinear_CUVtc.csv"
outputFile = "./CUV-Glosses.txt"

# Ordered (pattern, replacement) pairs fed to re.sub one after another.
# ORDER MATTERS: later rules operate on the output of earlier ones.
# Patterns are raw strings so regex escapes are not mangled by Python.
searchReplace = (
    # "{<digits>#" ... "|<digits>}" wrappers are removed, keeping the content.
    (r'\{\d+#', ''),
    # "|" must be escaped: a bare leading "|" is an empty alternative that
    # matches everywhere and turns the substitution into a no-op.
    (r'\|\d+\}', ''),
    # "〈<digits>*" ... "|<digits>〉" wrappers become ⸂ ... ⸃.
    # "*" must be escaped: "\d+*" is rejected by re ("multiple repeat").
    (r'〈\d+\*', '⸂'),
    (r'\|\d+〉', '⸃'),
    # Punctuation.
    (';', ''),
    (',', ''),
    ('。', ''),
    # Literal full stop; an unescaped "." would delete every character
    # except newlines.
    (r'\.', ''),
    # NOTE(review): identical to the comma rule above as it appears here;
    # possibly a fullwidth variant in the upstream file - confirm.
    (',', ''),
    # Bracketed annotations (non-greedy, so each bracket pair is separate).
    (r'〔.*?〕', ''),
    (r'【.*?】', ''),
    (r'\[.*?\]', ''),
    # Merge adjacent ⸂...⸃⸂...⸃ spans into one.
    ('⸃⸂', ''),
    (r'\(bcc.*?\)', ''),
    (r'\(.*?\)', ''),
    # NOTE(review): the two rules below cannot match once the generic
    # "(...)" rule above has run; kept to preserve the original rule list.
    (r'\(\*bcc.*?\)', ''),
    (r'\(=bcc.*?\)', ''),
    # NOTE(review): unescaped parentheses form a capture group, so this
    # removes the literal text "*bcc" only. Possibly meant for fullwidth
    # parentheses - confirm against the upstream file.
    (r'(\*bcc.*?)', ''),
    (' ', ''),
    # Any remaining asterisk becomes an opening ⸂ marker.
    (r'\*', '⸂'),
    # Drop a ⸂ directly after a tab and a ⸃ directly before a newline.
    ('\t⸂', '\t'),
    ('⸃\n', '\n'),
    # Remove rows that consist of digits, a tab, and nothing else.
    (r'\d+\t\n', ''),
    # NOTE(review): same as the space rule above as it appears here.
    (' ', ''),
)


def trim_glosses(text, rules=searchReplace):
    """Return *text* after applying every (pattern, replacement) rule in order.

    Args:
        text: Full contents of the gloss file as one string.
        rules: Iterable of (regex pattern, replacement) pairs. Defaults to
            the stage-1 rule table defined above (bound at definition time,
            so later reassignment of ``searchReplace`` does not affect it).

    Returns:
        The transformed string.
    """
    for search, replace in rules:
        text = re.sub(search, replace, text)
    return text


# Entry guard: the file I/O only runs when the file is executed as a script.
if __name__ == "__main__":
    with open(inputFile, 'r', encoding="utf-8") as f:
        newData = f.read()

    newData = trim_glosses(newData)

    # utf_8_sig writes a BOM; the next stage reads with the same codec.
    with open(outputFile, 'w', encoding="utf_8_sig") as f:
        f.write(newData)
|
|
|
|
|
|
|
|
|
|
# ══════ merge Chinese glosses into OGNTa ═══════════
#
# Stage 2: load the key/gloss pairs written by the previous stage and use
# them to fill the sixth column of every OGNTa row.

glosses = "./CUV-Glosses.txt"
inputFile = "./OGNTa.txt"
outputFile = "./OGNTa-TC.txt"


def load_glosses(lines):
    """Build a ``{key: gloss}`` dict from whitespace-separated lines.

    Args:
        lines: Iterable of text lines, each holding a key and a gloss
            separated by whitespace.

    Returns:
        Dict mapping each key to its gloss; later duplicates win.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tokens.
    """
    gloss_dict = {}
    for line in lines:
        fields = line.split()
        if not fields:
            # A blank line would otherwise fail the two-way unpacking below.
            continue
        key, value = fields
        gloss_dict[key] = value
    return gloss_dict


def merge_glosses(lines, gloss_dict):
    """Return output rows with column 6 replaced by the matching gloss.

    Each row is split on tabs. The first five columns are kept, and the
    sixth becomes the gloss looked up by the first column, or ``'-'`` when
    there is none. Columns past the sixth are dropped. Rows with fewer than
    five columns are padded with empty fields, and blank lines are skipped.

    Args:
        lines: Iterable of tab-separated rows (with or without a trailing
            newline).
        gloss_dict: Mapping from first-column key to gloss.

    Returns:
        List of newline-terminated, six-column rows.
    """
    merged = []
    for ol in lines:
        row = ol.rstrip("\n")
        if not row.strip():
            continue
        x = row.split("\t")
        # Normalise to exactly five leading columns before adding the gloss.
        head = (x + [""] * 5)[:5]
        gloss = gloss_dict.get(head[0]) or '-'
        merged.append("\t".join(head + [str(gloss)]) + "\n")
    return merged


# Entry guard: the file I/O only runs when the file is executed as a script.
if __name__ == "__main__":
    # loading glosses into dictionary
    with open(glosses, 'r', encoding="utf_8_sig") as gloss_file:
        gloss_dict = load_glosses(gloss_file)

    # loading OGNTa
    with open(inputFile, 'r', encoding="utf_8_sig") as f:
        Lines = f.readlines()

    # processing
    with open(outputFile, 'w', encoding='utf_8_sig') as f:
        f.writelines(merge_glosses(Lines, gloss_dict))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------
#
# Stage 3: rewrite the file named by ``outputFile`` in place, replacing
# English book abbreviations with their Chinese counterparts.

# Update Book Name Abbreviation
# Each pattern ends with a space, so only an abbreviation followed by a
# space is replaced.
searchReplace = (
    ('Mat ', '太 '),
    ('Mar ', '可 '),
    ('Luk ', '路 '),
    ('Joh ', '約 '),
    ('Act ', '徒 '),
    ('Rom ', '羅 '),
    ('1Co ', '林前 '),
    ('2Co ', '林後 '),
    ('Gal ', '加 '),
    ('Eph ', '弗 '),
    ('Php ', '腓 '),
    ('Col ', '西 '),
    ('1Th ', '帖前 '),
    ('2Th ', '帖後 '),
    ('1Ti ', '提前 '),
    ('2Ti ', '提後 '),
    ('Tit ', '多 '),
    ('Phm ', '門 '),
    ('Heb ', '來 '),
    ('Jas ', '雅 '),
    ('1Pe ', '彼前 '),
    ('2Pe ', '彼後 '),
    ('1Jo ', '約一 '),
    ('2Jo ', '約二 '),
    ('3Jo ', '約三 '),
    ('Jud ', '猶 '),
    ('Rev ', '啟 '),
)


def translate_book_names(text, rules=searchReplace):
    """Return *text* with every book abbreviation in *rules* replaced.

    Args:
        text: Full file contents as one string.
        rules: Iterable of (pattern, replacement) pairs applied in order
            with ``re.sub``. Defaults to the abbreviation table above.

    Returns:
        The text with all replacements applied.
    """
    for search, replace in rules:
        text = re.sub(search, replace, text)
    return text


# Entry guard: the file I/O only runs when the file is executed as a script.
if __name__ == "__main__":
    # ``outputFile`` is the path assigned earlier in this script.
    with open(outputFile, 'r', encoding="utf_8_sig") as f:
        newData = f.read()

    newData = translate_book_names(newData)

    with open(outputFile, 'w', encoding='utf_8_sig') as f:
        f.write(newData)
|