From e5719d59ea2d57311a7401d71fcac2185d33de92 Mon Sep 17 00:00:00 2001
From: Eliran Wong
Date: Mon, 1 Oct 2018 02:59:40 +0100
Subject: [PATCH] Add files via upload

---
 Script/exportTANTT_M_morphology.py |  97 +++++++++++++++++++++++
 Script/exportTANTT_M_text.py       | 116 ++++++++++++++++++++++++++++
 2 files changed, 213 insertions(+)
 create mode 100644 Script/exportTANTT_M_morphology.py
 create mode 100644 Script/exportTANTT_M_text.py

diff --git a/Script/exportTANTT_M_morphology.py b/Script/exportTANTT_M_morphology.py
new file mode 100644
index 0000000..0d21998
--- /dev/null
+++ b/Script/exportTANTT_M_morphology.py
@@ -0,0 +1,97 @@
+# Filter 'TANTT.csv' down to its "M" variant rows and write them, reshaped
+# for BibleBento, to 'TANTT_M_morphology.csv'.
+#
+# rename TANTT database to 'TANTT.csv'
+# put 'TANTT.csv' and this script in the same folder
+# locate the folder in terminal
+# enter command in terminal 'python exportTANTT_M_morphology.py'
+# (needs Python 3: the files are opened with an explicit encoding)
+
+import re
+
+inputFile = 'TANTT.csv'
+outputFile = 'TANTT_M_morphology.csv'
+
+# export latest glosses
+
+# read as UTF-8 explicitly so the Greek text does not depend on the
+# platform's default encoding; 'with' closes the file even on error
+with open(inputFile, 'r', encoding='utf-8') as f:
+    newData = f.read()
+
+# clean up: squeeze blank lines and repeated spaces, trim spaces around
+# tabs and at line starts, then drop everything before the first '41_Mat' row
+newData = re.sub('\n[\n]+?([^\n])', r'\n\1', newData, flags=re.M)
+newData = re.sub(' [ ]+?([^ ])', r' \1', newData, flags=re.M)
+newData = re.sub('^ ', '', newData, flags=re.M)
+newData = re.sub(' \t', '\t', newData, flags=re.M)
+newData = re.sub('\t ', '\t', newData, flags=re.M)
+newData = re.sub(r'\A[\d\D]*?\n41_Mat', '41_Mat', newData, flags=re.M)
+
+# mark punctuations: prefix '%' to rows whose third tab-separated field is empty
+newData = re.sub('^([^\n\t]*?\t[^\n\t]*?\t\t)', r'%\1', newData, flags=re.M)
+
+# remove punctuations: delete every line that contains '%'
+newData = re.sub('^.*?%.*?\n', '', newData, flags=re.M)
+
+# mark M variants
+# '*': the second field contains 'M'
+newData = re.sub('^([^\n\t]*?\t[^\n\t]*?M)', r'*\1', newData, flags=re.M)
+# '@': the line ends in a tab and the field before it contains 'M' followed
+# by '=' with no '+' or ';' in between
+newData = re.sub(r'^(.*?\t[^\n\t]*?M[^\n\t\+;]*?=[^\n\t]*?\t)$', r'@\1', newData, flags=re.M)
+# keep only the lines that now start with '@' or '*'
+newData = re.sub('^[^@*].*?\n', '', newData, flags=re.M)
+# drop '@' lines that contain such an 'M...=' directly followed by '<' or '>'
+newData = re.sub(r'^@(.*?\t).*?[^\n\t]*?M[^\n\t\+;]*?=[<>].*?\n', '', newData, flags=re.M)
+
+# export M variants
+# keep field 1, join fields 2, 4, 5 and 6 with '=', drop fields 3 and 7-9
+newData = re.sub('^(.*?\t)(.*?)\t.*?\t(.*?)\t(.*?)\t(.*?)\t.*?\t.*?\t.*?\t',
+                 r'\1\2=\3=\4=\5\t', newData, flags=re.M)
+newData = re.sub('\t$', '', newData, flags=re.M)
+# on '@' lines, swap the second field with a later '<letters>M<letters>=...'
+# entry that is terminated by ';'
+newData = re.sub('^(@.*?\t)(.*?)(\t.*?)([A-Za-z]*?M[A-Za-z]*?=[^\n; ]*?);', r'\1\4\3\2;', newData, flags=re.M)
+# strip the leading '@' / '*' marker
+newData = re.sub('^[@*]', '', newData, flags=re.M)
+
+# BibleBento format
+
+# book no: the line prefixes '41_' ... '67_' become '40 ' ... '66 '
+# NOTE(review): the chapter pattern below needs a tab in front of the
+# three-character book code, while the separator written here is kept as
+# found; confirm against the upstream script whether it should be a tab
+newData = re.sub('^(4[1-9]|5[0-9]|6[0-7])_',
+                 lambda m: str(int(m.group(1)) - 1) + ' ',
+                 newData, flags=re.M)
+
+# chapter no: '<tab>Bbb.CCC.VVV<tab>' becomes '<tab>CCC<tab>VVV<tab>', then
+# up to three zeros directly after a tab are removed
+newData = re.sub(r'\t...\.([0-9]+?)\.([0-9]+?)\t', r'\t\1\t\2\t', newData)
+newData = re.sub('\t00', '\t', newData)
+newData = re.sub('\t0', '\t', newData)
+
+# Greek unicode characters
+# NOTE(review): several pairs below look identical on screen; they are kept
+# as found, so compare their code points with the upstream script
+newData = re.sub('[άά]', 'ά', newData)
+newData = re.sub('[ίί]', 'ί', newData)
+newData = re.sub('[έέ]', 'έ', newData)
+newData = re.sub('[ώώ]', 'ώ', newData)
+newData = re.sub('[ήή]', 'ή', newData)
+newData = re.sub('[ύύ]', 'ύ', newData)
+newData = re.sub('[όό]', 'ό', newData)
+newData = re.sub('̓͂Α', 'Ἆ', newData)
+newData = re.sub('̓͂Η', 'Ἦ', newData)
+newData = re.sub('̓͂Ω', 'Ὦ', newData)
+newData = re.sub('ί̈', 'ΐ', newData)
+newData = re.sub('[ΐΐ]', 'ΐ', newData)
+newData = re.sub('[ΰΰ]', 'ΰ', newData)
+newData = re.sub('[᾿ʼ]', '᾽', newData)
+newData = re.sub('ῇ', 'ῇ', newData)
+newData = re.sub('ῇ', 'ῇ', newData)
+
+# save
+with open(outputFile, 'w', encoding='utf-8') as f:
+    f.write(newData)
diff --git a/Script/exportTANTT_M_text.py b/Script/exportTANTT_M_text.py
new file mode 100644
index 0000000..65ef60f
--- /dev/null
+++ b/Script/exportTANTT_M_text.py
@@ -0,0 +1,116 @@
+# Filter 'TANTT.csv' down to its "M" variant rows and write them, one
+# numbered verse per line, to 'TANTT_M_text.csv' (BibleBento format).
+#
+# rename TANTT database to 'TANTT.csv'
+# put 'TANTT.csv' and this script in the same folder
+# locate the folder in terminal
+# enter command in terminal 'python exportTANTT_M_text.py'
+# (needs Python 3: the files are opened with an explicit encoding)
+
+import re
+
+inputFile = 'TANTT.csv'
+outputFile = 'TANTT_M_text.csv'
+
+# export latest glosses
+
+# read as UTF-8 explicitly so the Greek text does not depend on the
+# platform's default encoding; 'with' closes the file even on error
+with open(inputFile, 'r', encoding='utf-8') as f:
+    newData = f.read()
+
+# clean up: squeeze blank lines and repeated spaces, trim spaces around
+# tabs and at line starts, then drop everything before the first '41_Mat' row
+newData = re.sub('\n[\n]+?([^\n])', r'\n\1', newData, flags=re.M)
+newData = re.sub(' [ ]+?([^ ])', r' \1', newData, flags=re.M)
+newData = re.sub('^ ', '', newData, flags=re.M)
+newData = re.sub(' \t', '\t', newData, flags=re.M)
+newData = re.sub('\t ', '\t', newData, flags=re.M)
+newData = re.sub(r'\A[\d\D]*?\n41_Mat', '41_Mat', newData, flags=re.M)
+
+# mark punctuations: prefix '%' to rows whose third tab-separated field is empty
+newData = re.sub('^([^\n\t]*?\t[^\n\t]*?\t\t)', r'%\1', newData, flags=re.M)
+
+# mark M variants
+# '*': the second field contains 'M'
+newData = re.sub('^([^\n\t]*?\t[^\n\t]*?M)', r'*\1', newData, flags=re.M)
+# '@': the line ends in a tab and the field before it contains 'M' followed
+# by '=' with no '+' or ';' in between
+newData = re.sub(r'^(.*?\t[^\n\t]*?M[^\n\t\+;]*?=[^\n\t]*?\t)$', r'@\1', newData, flags=re.M)
+# keep only the lines that now start with '@' or '*'
+newData = re.sub('^[^@*].*?\n', '', newData, flags=re.M)
+
+# export M variants
+# '*' lines: keep fields 1 and 4; the '*' must be escaped, because re rejects
+# a bare '^*' with "nothing to repeat" before any data is processed
+newData = re.sub(r'^\*(.*?\t).*?\t.*?\t(.*?)\t.*?$', r'\1\2', newData, flags=re.M)
+# drop '@' lines that contain such an 'M...=' directly followed by '<' or '>'
+newData = re.sub(r'^@(.*?\t).*?[^\n\t]*?M[^\n\t\+;]*?=[<>].*?\n', '', newData, flags=re.M)
+# other '@' lines: keep field 1 and the text between that '=' and the next '='
+newData = re.sub(r'^@(.*?\t).*?[^\n\t]*?M[^\n\t\+;]*?=([^\n\t=]*?)=[^\n\t]*?\t$', r'\1\2', newData, flags=re.M)
+
+# tag punctuations: strip the leading '%' marker
+# NOTE(review): the replacement adds no tag as found; confirm against the
+# upstream script
+newData = re.sub('^%(.*?\t)(.*?)$', r'\1\2', newData, flags=re.M)
+
+# BibleBento format
+
+# book no: the line prefixes '41_' ... '67_' become '40 ' ... '66 '
+# NOTE(review): the chapter pattern below needs a tab in front of the
+# three-character book code, while the separator written here is kept as
+# found; confirm against the upstream script whether it should be a tab
+newData = re.sub('^(4[1-9]|5[0-9]|6[0-7])_',
+                 lambda m: str(int(m.group(1)) - 1) + ' ',
+                 newData, flags=re.M)
+
+# chapter no: '<tab>Bbb.CCC.VVV<tab>' becomes '<tab>CCC<tab>VVV<tab>', then
+# up to three zeros directly after a tab are removed
+newData = re.sub(r'\t...\.([0-9]+?)\.([0-9]+?)\t', r'\t\1\t\2\t', newData)
+newData = re.sub('\t00', '\t', newData)
+newData = re.sub('\t0', '\t', newData)
+
+# Greek unicode characters
+# NOTE(review): several pairs below look identical on screen; they are kept
+# as found, so compare their code points with the upstream script
+newData = re.sub('[άά]', 'ά', newData)
+newData = re.sub('[ίί]', 'ί', newData)
+newData = re.sub('[έέ]', 'έ', newData)
+newData = re.sub('[ώώ]', 'ώ', newData)
+newData = re.sub('[ήή]', 'ή', newData)
+newData = re.sub('[ύύ]', 'ύ', newData)
+newData = re.sub('[όό]', 'ό', newData)
+newData = re.sub('̓͂Α', 'Ἆ', newData)
+newData = re.sub('̓͂Η', 'Ἦ', newData)
+newData = re.sub('̓͂Ω', 'Ὦ', newData)
+newData = re.sub('ί̈', 'ΐ', newData)
+newData = re.sub('[ΐΐ]', 'ΐ', newData)
+newData = re.sub('[ΰΰ]', 'ΰ', newData)
+newData = re.sub('[᾿ʼ]', '᾽', newData)
+newData = re.sub('ῇ', 'ῇ', newData)
+newData = re.sub('ῇ', 'ῇ', newData)
+
+# punctuations
+newData = re.sub('¶', '¶ ', newData)
+newData = re.sub('¬', '‡', newData)
+
+# format each verse on a single line: while two consecutive lines start with
+# the same three numeric fields, append the second line's rest to the first
+p = re.compile(r'^([0-9]+?\t[0-9]+?\t[0-9]+?\t)(.*?)\n\1', flags=re.M)
+while p.search(newData):
+    newData = p.sub(r'\1\2 ', newData)
+# NOTE(review): as found, this deletes every space in the data; confirm the
+# intended pattern against the upstream script
+newData = re.sub(' ', '', newData)
+
+# add order number: prefix every non-empty line with its 1-based position
+with open(outputFile, 'w', encoding='utf-8') as f:
+    order = 1
+    for line in newData.split('\n'):
+        if line != '':
+            f.write(str(order) + '\t' + line + '\n')
+            order = order + 1
+
+# do NOT add order number
+# with open(outputFile, 'w', encoding='utf-8') as f:
+#     f.write(newData)