richmahn_GEN_MSWord_notes/convert.py

390 lines
18 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
#
# TN_MSWrd_to_TSV9_via_Proskomma.py
#
# Copyright (c) 2021 unfoldingWord
# http://creativecommons.org/licenses/MIT/
# See LICENSE file for details.
#
# Contributors:
# Robert Hunt <Robert.Hunt@unfoldingword.org>
#
# Written Sept 2021 by RJH
# Last modified: 2021-09-21 by RJH
#
"""
Quick script to create 9-column TN files from MS-Word files.
NOTE: This requires the addition of the OrigQuote column!
"""
from typing import List, Tuple
import sys
import os
import csv
from pathlib import Path
import random
import re
import logging
import subprocess
from collections import OrderedDict
import urllib.request
from usfm_utils import unalign_usfm
from tx_usfm_tools.singleFilelessHtmlRenderer import SingleFilelessHtmlRenderer
from bs4 import BeautifulSoup
import difflib
# Folder containing the MS-Word-exported <BBB>.txt input files
LOCAL_SOURCE_FOLDERPATH = 'txt'
# The output folder below must also already exist!
LOCAL_OUTPUT_FOLDERPATH = 'tsv'
# Standard 3-letter USFM book codes mapped to their 2-digit book numbers.
# Note the jump from 'MAL':'39' to 'MAT':'41' -- number 40 is skipped by USFM convention.
BBB_NUMBER_DICT = {'GEN':'01','EXO':'02','LEV':'03','NUM':'04','DEU':'05',
'JOS':'06','JDG':'07','RUT':'08','1SA':'09','2SA':'10','1KI':'11',
'2KI':'12','1CH':'13','2CH':'14','EZR':'15',
'NEH':'16',
'EST':'17',
'JOB':'18','PSA':'19','PRO':'20','ECC':'21','SNG':'22','ISA':'23',
'JER':'24','LAM':'25','EZK':'26','DAN':'27','HOS':'28','JOL':'29',
'AMO':'30','OBA':'31','JON':'32','MIC':'33','NAM':'34','HAB':'35',
'ZEP':'36','HAG':'37','ZEC':'38','MAL':'39',
'MAT':'41','MRK':'42','LUK':'43','JHN':'44','ACT':'45',
'ROM':'46','1CO':'47','2CO':'48','GAL':'49','EPH':'50','PHP':'51',
'COL':'52','1TH':'53','2TH':'54','1TI':'55','2TI':'56','TIT':'57',
'PHM':'58','HEB':'59','JAS':'60','1PE':'61','2PE':'62','1JN':'63',
'2JN':'64',
'3JN':'65', 'JUD':'66', 'REV':'67' }
# Node.js helper script, run via subprocess, which finds original-language quotes for ULT quotes
HELPER_PROGRAM_NAME = 'TN_ULT_Quotes_to_OLQuotes.js'
# NOTE(review): DEBUG_LEVEL is not referenced anywhere in this file -- confirm it's still wanted
DEBUG_LEVEL = 1
# book_data[chapter][verse] = {'usfm': ..., 'html': ...}; populated by get_book_data()
book_data = OrderedDict()
# Accumulated error rows (header row first); written out to disk by write_errors()
errors = [['line', 'type', 'note']]
def get_book_data(BBB:str='GEN') -> None:
    """
    Download the unfoldingWord ULT USFM for the given book (default 'GEN'),
    render it to HTML, and populate the module-level book_data dict with
    per-chapter, per-verse entries:
        book_data[chapter][verse] = {'usfm': ..., 'html': ...}

    Generalized from a hard-coded Genesis URL: the repo filename pattern is
    '<nn>-<BBB>.usfm' where <nn> comes from BBB_NUMBER_DICT.
    """
    url = f"https://git.door43.org/unfoldingWord/en_ult/raw/branch/master/{BBB_NUMBER_DICT[BBB]}-{BBB}.usfm"
    response = urllib.request.urlopen(url)
    data = response.read() # a `bytes` object
    book_usfm = data.decode('utf-8') # a `str`; this step can't be used if data is binary
    unaligned_usfm = unalign_usfm(book_usfm)
    book_html, warnings = SingleFilelessHtmlRenderer({BBB: unaligned_usfm}).render()
    # Split on the verse-number spans: each verse contributes 4 consecutive list entries
    # (text-before, the captured <span>, the chapter number, the verse number or range)
    html_verse_splits = re.split(r'(<span id="[^"]+-ch-0*(\d+)-v-(\d+(?:-\d+)?)" class="v-num">)', book_html)
    usfm_chapter_splits = re.split(r'\\c ', unaligned_usfm)
    usfm_verse_splits = None
    chapter_verse_index = 0
    for i in range(1, len(html_verse_splits), 4):
        chapter = html_verse_splits[i+1]
        verses = html_verse_splits[i+2] # may be a bridged range like '1-2'
        if chapter not in book_data:
            # First verse of a new chapter: prepare its per-verse USFM list
            book_data[chapter] = OrderedDict()
            usfm_chapter = f'\\c {usfm_chapter_splits[int(chapter)]}'
            usfm_verse_splits = re.split(r'\\v ', usfm_chapter)
            chapter_verse_index = 0
        chapter_verse_index += 1
        verse_usfm = f'\\v {usfm_verse_splits[chapter_verse_index]}'
        verse_html = html_verse_splits[i] + html_verse_splits[i+3]
        verse_html = re.split('<h2', verse_html)[0] # remove next chapter since only split on verses
        # Strip empty tags (except <br>/<img>) left behind by the renderer
        verse_soup = BeautifulSoup(verse_html, 'html.parser')
        for tag in verse_soup.find_all():
            if (not tag.contents or len(tag.get_text(strip=True)) <= 0) and tag.name not in ['br', 'img']:
                tag.decompose()
        verse_html = str(verse_soup)
        verses = re.findall(r'\d+', verses)
        for verse in verses:
            verse = verse.lstrip('0')
            book_data[chapter][verse] = {
                'usfm': verse_usfm,
                'html': verse_html
            }
# end of get_book_data function
def add_error(line:str, type:str, message:str):
    """Record one error row (line number, category, message) in the module-level errors list."""
    errors.append([line, type, message])
def write_errors():
    """
    Write all accumulated error rows (header row included) to errors.tsv
    in the output folder, as tab-separated values.
    """
    # Use the configured output folder rather than the previous hard-coded 'tsv/' path,
    # and declare UTF-8 explicitly so the notes' non-ASCII characters survive on any platform
    errors_filepath = os.path.join(LOCAL_OUTPUT_FOLDERPATH, 'errors.tsv')
    with open(errors_filepath, 'w', newline='\n', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter='\t', quotechar='"', lineterminator="\n")
        writer.writerows(errors)
def get_input_fields(input_folderpath:str, BBB:str) -> Tuple[str,str,str,str,str,str]:
    """
    Generator to read the exported MS-Word .txt files
    and return the needed fields.

    Yields a 6-tuple with:
        C, V, (ULT)verseText, (ULT)glQuote, occurrence (as a string), note
    (The earlier docstring claimed a 4-tuple; six fields are actually yielded.)

    Problems found in the input are recorded via add_error() as we go.
    Relies on module-level book_data (filled by get_book_data()) for the ULT verse text.
    """
    print(f" Loading {BBB} TN links from MS-Word exported text file…")
    input_filepath = os.path.join(input_folderpath, f'{BBB}.txt')
    Bbb = BBB[0] + BBB[1].lower() + BBB[2].lower() # e.g. 'GEN' -> 'Gen', as used on the verse-heading lines
    C = V = '0'
    verseText = glQuote = note = ''
    # Fixed: text was previously unassigned until the first verse line, so a glQuote
    # arriving before any verse heading raised NameError at the re.findall below
    text = ''
    occurrence = 0
    occurrences = {}
    with open(input_filepath, 'rt', encoding='utf-8') as input_text_file:
        prevLine = ''
        for line_number, line in enumerate(input_text_file, start=1):
            if line_number == 1 and line.startswith('\ufeff'):
                line = line[1:] # Remove optional BOM
            line = line.rstrip('\n\r').strip().replace("\xa0", " ")
            # A bare number is a chapter heading
            if line.isdigit():
                print("LINE IS DIGIT!!! ", line)
                newC = line
                if int(line) != int(C)+1:
                    add_error(line_number, 'file', f"Chapter number is not increasing as expected: moving from {C} to {newC}")
                V = '0'
                C = newC
                glQuote = note = verseText = ''
                prevLine = line
                continue
            # A 'Gen C:V <verse text>' line introduces a verse and quotes its ULT text
            if line.startswith(f'{Bbb} {C}:'):
                parts = line.split(' ')
                print(parts)
                newV = parts[1].split(':')[1]
                print(line)
                print(newV, V)
                if int(newV) != int(V)+1:
                    add_error(line_number, "file", f"Verse number is not increasing as expected: moving from {V} to {newV}")
                V = newV
                verseText = ' '.join(parts[2:])
                print(f"|{verseText}|")
                print(book_data[C][V])
                # Strip tags, leading verse number, and extra whitespace from the rendered ULT verse
                text = re.sub('<[^<]+?>', '', book_data[C][V]['html']).strip()
                text = re.sub(r'^\d+ *', '', text) # raw string: was a non-raw '\d' escape
                text = re.sub(r'\s+', ' ', text)
                print(f"?{text}?")
                verseText = re.sub(r'\s+', ' ', verseText)
                if verseText not in text:
                    # Build a character-level diff to help the editor fix the source document
                    diff = []
                    prev = ' '
                    for i,s in enumerate(difflib.ndiff(verseText,text)):
                        if s[0] != ' ':
                            if prev == s[0]:
                                # Extend the previous single-character edit entry
                                diff[-1] = re.sub('`(.+)`', rf'`\1{s[-1]}`', diff[-1])
                            elif s[0]=='-':
                                diff.append(f'Delete `{s[-1]}` from position {i}')
                            elif s[0]=='+':
                                diff.append(f'Add `{s[-1]}` to position {i}')
                            prev = s[0]
                    diffstr = "\n\n".join(diff)
                    add_error(line_number, "verse", f'{BBB} {C}:{V}: Verse should read:\n> {text}\n\nNOT\n> {verseText}\n\n*DIFF:*\n\n{diffstr}')
                occurrences = {} # occurrence counting restarts for each verse
                glQuote = note = ''
                prevLine = line
                continue
            # Blank lines, paragraph breaks, and 'C:V' reference lines end the current note entry
            if not line or 'Paragraph Break' in line or line.startswith(f'{C}:') or prevLine.startswith(f'{C}:'):
                if glQuote and note:
                    yield C, V, verseText, glQuote, str(occurrence), note
                glQuote = note = ''
                occurrence = 0
                prevLine = line
                continue
            # Once a glQuote is pending, subsequent lines accumulate into its note
            if glQuote:
                if note:
                    note += " "
                note += line
                prevLine = line
                continue
            # Otherwise this line IS the glQuote; check it occurs in the ULT verse text
            glQuote = line
            quote_count = len(re.findall(r'(^|\W|\b)' + re.escape(glQuote) + r'(\W|\b|$)', text))
            if quote_count == 0:
                add_error(line_number, "glQuote", f'{Bbb} {C}:{V}: GL Quote not found:\n```\n{glQuote}\n```\nnot in\n\n> {text}')
            else:
                # Count how often each word-prefix of this glQuote has been seen in this verse
                # so far, to determine which occurrence of the quote this note refers to
                words = glQuote.split(' ')
                words_str = ''
                for word in words:
                    if words_str:
                        words_str += ' '
                    words_str += word
                    if words_str not in occurrences:
                        occurrences[words_str] = 1
                    else:
                        occurrences[words_str] += 1
                occurrence = occurrences[glQuote]
                if quote_count < occurrence:
                    occurrence = quote_count # can't exceed the actual number of matches
            prevLine = line
    # Flush any final pending note at end of file
    if glQuote and note:
        yield C, V, verseText, glQuote, str(occurrence), note
# end of get_input_fields function
# Placeholder written into the OrigQuote column until the Proskomma helper supplies the real value
OrigL_QUOTE_PLACEHOLDER = "NO OrigLQuote AVAILABLE!!!"
def convert_MSWrd_TN_TSV(input_folderpath:str, output_folderpath:str, BBB:str, nn:str) -> Tuple[int,int,int]:
    """
    Function to read the exported .txt file from MS-Word and write the 9-column TN TSV file.

    Returns a 3-tuple:
        (lines read from the exported file,
         GLQuotes successfully matched to an OrigLQuote,
         GLQuotes that failed to match).
    (The previous docstring and '-> int' annotation were wrong: three values are returned.)
    """
    testament = 'OT' if int(nn)<40 else 'NT'
    output_filepath = os.path.join(output_folderpath, f'en_tn_{nn}-{BBB}.tsv')
    # Collect the existing chapter-intro rows from the current TSV file so they can be re-inserted
    intros = {}
    with open(output_filepath, 'r', encoding='utf-8') as output_TSV_file:
        tsv_reader = csv.DictReader(output_TSV_file, delimiter="\t")
        for row in tsv_reader:
            if row["Verse"] == "intro":
                intros[row["Chapter"]] = "\t".join(row.values()) + "\n"
    temp_output_filepath = Path(f"{output_filepath}.tmp")
    # Fixed: line_count was previously undefined at the return statement (NameError)
    # whenever the input file produced no note entries at all
    line_count = 0
    with open(temp_output_filepath, 'wt', encoding='utf-8') as temp_output_TSV_file:
        previously_generated_ids:List[str] = [''] # We make ours unique per file (spec only used to say unique per verse)
        temp_output_TSV_file.write('Book\tChapter\tVerse\tID\tSupportReference\tOrigQuote\tOccurrence\tGLQuote\tOccurrenceNote\n')
        temp_output_TSV_file.write(intros['front'])
        prev_C = '0'
        for line_count, (C, V, verse_text, gl_quote, occurrence, note) in enumerate(get_input_fields(input_folderpath, BBB), start=1):
            # print(f"Got {BBB} {C}:{V} '{note}' for '{gl_quote}' {occurrence} in: {verse_text}")
            # Fixed: compare chapter numbers numerically -- as strings '10' < '2',
            # so intros for chapters 10+ could be skipped
            if int(C) > int(prev_C) and C in intros:
                temp_output_TSV_file.write(intros[C])
                prev_C = C
            # Generate a random 4-character row ID (letter first), unique within this file
            generated_id = ''
            while generated_id in previously_generated_ids:
                generated_id = random.choice('abcdefghijklmnopqrstuvwxyz') \
                             + random.choice('abcdefghijklmnopqrstuvwxyz0123456789') \
                             + random.choice('abcdefghijklmnopqrstuvwxyz0123456789') \
                             + random.choice('abcdefghijklmnopqrstuvwxyz0123456789')
            previously_generated_ids.append(generated_id)
            support_reference = ''
            orig_quote = OrigL_QUOTE_PLACEHOLDER
            # Find "See:" TA refs and process them -- should only be one
            for match in re.finditer(r'\(See: ([-A-Za-z0-9]+?)\)', note):
                if support_reference:
                    add_error("-1", "format", f"{BBB} {C}:{V}: Should only be one TA ref: {note}")
                support_reference = match.group(1)
                note = f"{note[:match.start()]}(See: [[rc://en/ta/man/translate/{support_reference}]]){note[match.end():]}"
            # Strip stray straight quote marks from the GL quote
            gl_quote = gl_quote.strip()
            if gl_quote.startswith('"'): gl_quote = gl_quote[1:]
            if gl_quote.endswith('"'): gl_quote = gl_quote[:-1]
            if gl_quote.startswith("'"): gl_quote = gl_quote[1:]
            if gl_quote.endswith("'"): gl_quote = gl_quote[:-1]
            gl_quote = gl_quote.replace('" ', '').replace(' "', '').replace("' ", ' ').replace(" '", ' ').replace("'s", 's')
            if '"' in gl_quote or "'" in gl_quote:
                # Fixed: was line_number, which is not defined in this function (NameError);
                # "-1" matches the other format errors reported from here
                add_error("-1", "format", f"{BBB} {C}:{V}: glQuote still has straight quote marks: '{gl_quote}'")
            # Strip/normalize straight quote marks in the note, converting to curly where obvious
            note = note.strip()
            if note.startswith('"'): note = note[1:]
            if note.endswith('"'): note = note[:-1]
            note = note.replace('" ', '').replace(' "', '') \
                .replace('".', '”.').replace('",', '”,') \
                .replace('("', '(“').replace('")', '”)') \
                .replace("' ", ' ').replace(" '", ' ').replace("'s", 's')
            if '"' in note or "'" in note:
                add_error("-1", "format", f"{BBB} {C}:{V}: note still has straight quote marks: '{note}'")
            temp_output_TSV_file.write(f'{BBB}\t{C}\t{V}\t{generated_id}\t{support_reference}\t{orig_quote}\t{occurrence}\t{gl_quote}\t{note}\n')
    # Now use Proskomma to find the ULT GLQuote fields for the OrigQuotes in the temporary outputted file
    print(f" Running Proskomma to find OrigL quotes for {testament} {BBB}… (might take a few minutes)")
    completed_process_result = subprocess.run(['node', HELPER_PROGRAM_NAME, temp_output_filepath, testament], capture_output=True)
    # print(f"Proskomma {BBB} result was: {completed_process_result}")
    if completed_process_result.returncode:
        print(f" Proskomma {BBB} ERROR result was: {completed_process_result.returncode}")
    if completed_process_result.stderr:
        print(f" Proskomma {BBB} error output was:\n{completed_process_result.stderr.decode()}")
    proskomma_output_string = completed_process_result.stdout.decode()
    # print(f"Proskomma {BBB} output was: {proskomma_output_string}") # For debugging JS helper program only
    output_lines = proskomma_output_string.split('\n')
    # Fixed: str.split always returns at least [''], so the old truthiness test never took
    # the else-branch, and output_lines[-2] raised IndexError on empty Proskomma output
    if len(output_lines) > 1:
        # Log any errors that occurred -- not really needed now coz they go to stderr
        print_next_line_counter = 0
        for output_line in output_lines:
            if 'Error:' in output_line:
                logging.error(output_line)
                print_next_line_counter = 2 # Log this many following lines as well
            elif print_next_line_counter > 0:
                logging.error(output_line)
                print_next_line_counter -= 1
        print(f" Proskomma got: {' / '.join(output_lines[:9])}") # Displays the UHB/UGNT and ULT loading times
        print(f" Proskomma did: {output_lines[-2]}")
    else: logging.critical("No output from Proskomma!!!")
    # Put the GL Quotes into a dict for easy access
    match_dict = {}
    for match in re.finditer(r'(\w{3})_(\d{1,3}):(\d{1,3}) ►(.+?)◄ “(.+?)”', proskomma_output_string):
        B, C, V, gl_quote, orig_quote = match.groups()
        assert B == BBB, f"{B} {C}:{V} '{orig_quote}' Should be equal '{B}' '{BBB}'"
        if orig_quote:
            match_dict[(C,V,gl_quote)] = orig_quote
        else:
            logging.error(f"{B} {C}:{V} '{gl_quote}' Should have gotten an OrigLQuote")
    print(f" Got {len(match_dict):,} unique OrigL Quotes back from Proskomma for {BBB}")
    match_count = fail_count = 0
    if match_dict: # (if not, the ULT book probably isn't aligned yet)
        # Now put the OrigL Quotes into the file
        with open(temp_output_filepath, 'rt', encoding="utf-8") as temp_input_text_file:
            with open(output_filepath, 'wt', encoding='utf-8') as output_TSV_file:
                output_TSV_file.write(temp_input_text_file.readline()) # Write the TSV header
                for line in temp_input_text_file:
                    print(line)
                    B, C, V, rowID, support_reference, orig_quote, occurrence, gl_quote, occurrence_note = line.split('\t')
                    try:
                        if gl_quote:
                            orig_quote = match_dict[(C,V,gl_quote)]
                            match_count += 1
                    except KeyError:
                        logging.error(f"Unable to find OrigLQuote for {BBB} {C}:{V} {rowID} '{gl_quote}'")
                        fail_count += 1
                    # orig_quote = orig_quote.replace('…',' … ').replace('  ',' ') # Put space around ellipsis in field intended for human readers
                    output_TSV_file.write(f'{B}\t{C}\t{V}\t{rowID}\t{support_reference}\t{orig_quote}\t{occurrence}\t{gl_quote}\t{occurrence_note}')
    os.remove(temp_output_filepath)
    return line_count, match_count, fail_count
# end of convert_MSWrd_TN_TSV
def main():
    """
    Go through the list of Bible books
    and convert them
    while keeping track of some basic statistics
    """
    print("TN_MSWrd_to_TSV9_via_Proskomma.py")
    print(f" Source folderpath is {LOCAL_SOURCE_FOLDERPATH}/")
    print(f" Output folderpath is {LOCAL_OUTPUT_FOLDERPATH}/")
    # Running totals across all processed books
    total_files_read = total_files_written = 0
    total_lines_read = total_quotes_written = 0
    total_GLQuote_failures = 0
    # NOTE(review): failed_book_list is never appended to anywhere in this file,
    # so the critical log at the bottom can never fire -- confirm whether that's intended
    failed_book_list = []
    for BBB, nn in BBB_NUMBER_DICT.items():
        if BBB != 'GEN': continue # Just process this one book
        # if BBB not in ('MAT','MRK','LUK','JHN', 'ACT',
        #     'ROM','1CO','2CO','GAL','EPH','PHP','COL',
        #     '1TH','2TH','1TI','2TI','TIT','PHM',
        #     'HEB','JAS','1PE','2PE','1JN','2JN','3JN','JUD','REV'):
        #     continue # Just process NT books
        lines_read, this_note_count, fail_count = convert_MSWrd_TN_TSV(LOCAL_SOURCE_FOLDERPATH, LOCAL_OUTPUT_FOLDERPATH, BBB, nn)
        total_lines_read += lines_read
        total_files_read += 1
        if this_note_count:
            total_quotes_written += this_note_count
            total_files_written += 1
        total_GLQuote_failures += fail_count
    # Print the summary statistics
    print(f" {total_lines_read:,} lines read from {total_files_read} TSV file{'' if total_files_read==1 else 's'}")
    print(f" {total_quotes_written:,} GL quotes written to {total_files_written} TSV file{'' if total_files_written==1 else 's'} in {LOCAL_OUTPUT_FOLDERPATH}/")
    if total_GLQuote_failures:
        print(f" Had a total of {total_GLQuote_failures:,} GLQuote failure{'' if total_GLQuote_failures==1 else 's'}!")
    if failed_book_list:
        logging.critical(f"{len(failed_book_list)} books failed completely: {failed_book_list}")
# end of main function
if __name__ == '__main__':
    get_book_data() # Fetch and pre-parse the ULT book text into book_data first
    main()
    write_errors() # Dump any accumulated error rows to the output TSV
# end of TN_MSWrd_to_TSV9_via_Proskomma.py