Andley_OGNTa/Scripts/Step1-Trim OGNTa-TC.py

123 lines
2.5 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
# ══════ trim Chinese glosses ═══════════════════════
# Stage 1: read the whole interlinear CSV into memory so the searchReplace
# table that follows can be applied to it as a single string.
inputFile = "./source/OpenGNT_interlinear_CUVtc.csv"
outputFile = "./CUV-Glosses.txt"
# `with` closes the handle even if read() raises (e.g. on a decoding error).
with open(inputFile, 'r', encoding="utf-8") as f:
    newData = f.read()
# Ordered (regex pattern, replacement) pairs. The loop that follows this table
# passes each pair to re.sub in sequence, so every entry sees the output of
# the entries above it.
# NOTE(review): the hosting page flags this file as containing invisible
# Unicode characters. Several entries below render as duplicates (three
# '\d+' rows) or as empty patterns, so the real literals probably contain
# characters that are not visible in this copy -- confirm against the raw
# file before editing or re-saving this table.
# NOTE(review): '\d', '\[', '\(' and '\*' are not Python string escapes, so
# the backslash is handed through to re unchanged; raw strings (r'...') would
# state that intent explicitly.
searchReplace = (
('\d+',''),
('\d+',''),
('\d+',''),
('\d+〉',''),
('',''),
('',''),
('',''),
('',''),
(',',''),  # drop commas
('.*?',''),
('【.*?】',''),  # drop text enclosed in 【 】 (shortest match)
('\[.*?\]',''),  # drop text enclosed in [ ] (shortest match)
('⸃⸂',''),
('\(bcc.*?\)',''),
('\(.*?\)',''),  # drop any remaining parenthesised text (shortest match)
('\(\*bcc.*?\)',''),
('\(=bcc.*?\)',''),
('\*bcc.*?',''),
(' ',''),
('\*',''),  # drop literal asterisks
('\t','\t'),  # as rendered here this maps a tab to a tab
('\n','\n'),  # as rendered here this maps a newline to a newline
('\d+\t\n',''),  # drop a digit run that is directly followed by tab + newline
(' ','')
)
# Apply every (pattern, replacement) pair in table order; each pass works on
# the result of the previous one, so the order of searchReplace matters.
for search, replace in searchReplace:
    newData = re.sub(search, replace, newData)
# utf_8_sig writes a UTF-8 BOM at the start of the output; `with` guarantees
# the file is flushed and closed even if the write fails.
with open(outputFile, 'w', encoding="utf_8_sig") as f:
    f.write(newData)
# ══════ load Chinese glosses for merging into OGNTa ═══════════
glosses = "./CUV-Glosses.txt"
inputFile = "./OGNTa.txt"
outputFile = "./OGNTa-TC.txt"
# loading glosses into dictionary
# Each non-blank line must hold exactly two whitespace-separated fields
# (key, gloss); anything else still raises ValueError on unpacking.
gloss_dict = {}
with open(glosses, 'r', encoding="utf_8_sig") as gloss_file:
    for Line in gloss_file:
        fields = Line.split()
        if not fields:
            # Blank line (e.g. a trailing empty line): nothing to load.
            continue
        key, value = fields
        gloss_dict[key] = value
# loading OGNTa
with open(inputFile, 'r', encoding="utf_8_sig") as f:
    Lines = f.readlines()
# processing
# Each row is tab-separated; the sixth column is overwritten with the gloss
# looked up by the first column, or '-' when no (non-empty) gloss exists.
# Only the first six columns are written back.
with open(outputFile, 'w', encoding='utf_8_sig') as f:
    for ol in Lines:
        if not ol.rstrip("\n"):
            # Blank row: there is no sixth column to fill, so skip it
            # instead of failing with IndexError.
            continue
        x = ol.split("\t")
        x[5] = gloss_dict.get(x[0]) or '-'
        f.write("\t".join(x[:6]) + "\n")
# ---------------------------------------------------
# Read the file at outputFile back into memory as one string so the
# book-name replacements that follow can be applied to its full text.
with open(outputFile, 'r', encoding="utf_8_sig") as f:
    newData = f.read()
# Update Book Name Abbreviation
# (search, replacement) pairs keyed on a three-character English book
# abbreviation followed by a space. The loop after this table applies them
# with re.sub, so a search string matches wherever it occurs in the text.
# NOTE(review): many replacements render as '' in this copy while others
# carry a Chinese abbreviation (e.g. '林前 '). The hosting page reports
# invisible/ambiguous Unicode characters in this file, so the empty-looking
# values may hold characters that are not visible here -- confirm against the
# raw file before editing.
searchReplace = (
('Mat ', ''),
('Mar ', ''),
('Luk ', ''),
('Joh ', ''),
('Act ', ''),
('Rom ', ''),
('1Co ', '林前 '),
('2Co ', '林後 '),
('Gal ', ''),
('Eph ', ''),
('Php ', ''),
('Col ', '西 '),
('1Th ', '帖前 '),
('2Th ', '帖後 '),
('1Ti ', '提前 '),
('2Ti ', '提後 '),
('Tit ', ''),
('Phm ', ''),
('Heb ', ''),
('Jas ', ''),
('1Pe ', '彼前 '),
('2Pe ', '彼後 '),
('1Jo ', '約一 '),
('2Jo ', '約二 '),
('3Jo ', '約三 '),
('Jud ', ''),
('Rev ', '')
)
# Apply the book-name pairs in table order to the in-memory text.
for search, replace in searchReplace:
    newData = re.sub(search, replace, newData)
# Overwrite outputFile in place with the renamed text (UTF-8 with BOM);
# `with` closes the handle even if the write fails.
with open(outputFile, 'w', encoding='utf_8_sig') as f:
    f.write(newData)