richmahn_GEN_MSWord_notes/TN_TSV9_to_TSV7.py

191 lines
8.8 KiB
Python
Executable File

#!/usr/bin/env python3
#
# TN_TSV9_to_TSV7.py
#
# Copyright (c) 2020-2021 unfoldingWord
# http://creativecommons.org/licenses/MIT/
# See LICENSE file for details.
#
# Contributors:
# Robert Hunt <Robert.Hunt@unfoldingword.org>
#
# Written Aug 2020 by RJH
# Last modified: 2021-06-28 by RJH
#
"""
Quick script to copy TN from 9-column TSV files
and put into a TSV file with the new 7-column format.
"""
import logging
import os
import random
import re
from pathlib import Path
from typing import Iterator, List, Tuple
# Everything is located relative to the current working directory
LOCAL_SOURCE_BASE_FOLDERPATH = Path('.')
# The 9-column 'en_tn_{nn}-{BBB}.tsv' source files are read from this folder
LOCAL_SOURCE_FOLDERPATH = LOCAL_SOURCE_BASE_FOLDERPATH / 'tsv'
# The 7-column 'tn_{BBB}.tsv' files are written to this folder
#   (currently the same folder as the source files)
LOCAL_OUTPUT_FOLDERPATH = LOCAL_SOURCE_BASE_FOLDERPATH / 'tsv'
# The books to convert: book code -> the two-digit number used in the source filename
BBB_NUMBER_DICT = {'GEN': '01'}
def get_source_lines(BBB:str, nn:str) -> Iterator[Tuple[int,List[str]]]:
    """
    Generator to read the 9-column TN TSV file for one book
        and yield each line split into its tab-separated fields.

    BBB is the book code and nn is the book number string
        (together they select the source file 'en_tn_{nn}-{BBB}.tsv'
            inside LOCAL_SOURCE_FOLDERPATH).

    Yields a 2-tuple with:
        line number (starting from 1)
        list of the fields from that line
            (The line is NOT stripped first,
                so the final field still ends with the nl char,
                and a blank line yields a single-entry list.)

    No checking of the number of fields is done here -- that's left to the caller.
    """
    source_filename = f'en_tn_{nn}-{BBB}.tsv'
    source_filepath = LOCAL_SOURCE_FOLDERPATH.joinpath(source_filename)
    print(f" Getting source lines from {source_filepath}")
    # Read as UTF-8 explicitly rather than relying on the platform's default encoding
    with open(source_filepath, 'rt', encoding='utf-8') as source_tsv_file:
        for line_number,line in enumerate(source_tsv_file, start=1):
            # NB: A bare rstrip() here would also remove any trailing tabs
            #       and hence change the number of fields
            yield line_number, line.split('\t')
# end of get_source_lines function
# Characters allowed in the first / remaining positions of a 4-character row ID
_ROW_ID_LETTERS = 'abcdefghijklmnopqrstuvwxyz'
_ROW_ID_LETTERS_AND_DIGITS = 'abcdefghijklmnopqrstuvwxyz0123456789'
# Replacement letter for a row ID which wrongly starts with a digit
#   We don't use i l o (more easily confused)
_ROW_ID_DIGIT_FIXES = {'1':'a', '2':'t', '3':'c', '4':'d', '5':'f', '6':'g', '7':'s', '8':'h', '9':'n', '0':'z' }


def _tidy_orig_quote(orig_quote:str) -> str:
    """
    Returns the original-language quote with non-break and zero-width spaces handled
        and with the parts of a discontiguous quote joined by ' & '.
    """
    orig_quote = orig_quote.replace('\u00A0', ' ') # Replace non-break spaces
    orig_quote = orig_quote.replace('\u200B', '') # Delete zero-width spaces
    # NOTE(review): The search strings in the next four steps had been reduced to
    #   empty strings in the copy of this script that was reviewed
    #   (which made the final step insert ' & ' between EVERY character).
    #   They've been reconstructed as ellipsis handling, i.e., '...' or U+2026
    #   between the parts of a quote becomes ' & ' -- confirm against the upstream script.
    #   Escapes are used so that the characters can't get lost again.
    orig_quote = orig_quote.replace('...', '\u2026')
    orig_quote = orig_quote.replace(' \u2026', '\u2026').replace('\u2026 ', '\u2026')
    orig_quote = orig_quote.strip('\u2026') # Should only be BETWEEN words
    orig_quote = orig_quote.replace('\u2026', ' & ')
    return orig_quote.strip()
# end of _tidy_orig_quote function


def _tidy_occurrence_note(occurrence_note:str) -> str:
    """
    Returns the note with <br> line breaks converted to the two-character sequence \\n,
        rc://en/ links made language-independent,
        and unwanted multiple spaces reduced to single spaces.
    """
    occurrence_note = occurrence_note.strip()
    occurrence_note = occurrence_note.replace('<BR>', '<br>')
    if occurrence_note.startswith('<br>'): occurrence_note = occurrence_note[4:]
    if occurrence_note.endswith('<br>'): occurrence_note = occurrence_note[:-4]
    occurrence_note = occurrence_note.replace('<br>', '\\n')
    occurrence_note = occurrence_note.replace('rc://en/', 'rc://*/')
    # NOTE(review): Three chained .replace('', '') calls followed here in the reviewed copy.
    #   As written they did nothing (presumably their search strings were invisible
    #   characters that got lost) so they've been left out
    #   -- restore them from the upstream script if they're needed.
    # NOTE(review): The literals in the remaining steps had their runs of spaces collapsed
    #   in the reviewed copy (so the bullet clean-up was an endless loop as written).
    #   The exact widths couldn't be recovered, so regexes are used that don't depend on them.
    # Reduce multiple spaces after a markdown bullet to a single space
    occurrence_note = re.sub(r'\* {2,}', '* ', occurrence_note)
    # Reduce any other multiple spaces to a single space
    #   but leave spaces directly after a \n line break alone (used in markdown for indenting)
    occurrence_note = re.sub(r'(\\n *)| {2,}', lambda match: match.group(1) or ' ', occurrence_note)
    return occurrence_note.strip()
# end of _tidy_occurrence_note function


def make_TSV_file(BBB:str, nn:str) -> Tuple[int,int]:
    """
    Converts the 9-column TN TSV file for the given book into the 7-column format.

    Combines chapter and verse number into reference
    Does a little checking and cleaning of other fields
    Drops the GL Quote field
        (except that a 'Connecting Statement:', 'General Information:',
            or 'A Bible story from' GL Quote becomes a heading at the start of the note)
    Writes the 7-column TSV file 'tn_{BBB}.tsv' into LOCAL_OUTPUT_FOLDERPATH

    Source lines without exactly 9 fields are logged, counted as errors, and skipped.
    Row IDs that aren't 4 characters long are logged, counted as errors,
        and replaced by a random ID.

    Raises AssertionError if the header line isn't as expected
        or if a row ID contains a character that can't be accepted or fixed.

    Returns a 2-tuple with:
        the number of lines written (including the header line)
        the number of errors counted
    """
    print(f" Converting TN {BBB} links to TSV…")
    output_folderpath = LOCAL_OUTPUT_FOLDERPATH #.joinpath(BBB)
    output_folderpath.mkdir(parents=True, exist_ok=True)
    output_filepath = output_folderpath.joinpath(f'tn_{BBB}.tsv')
    num_lines = num_errors = 0
    seen_rowIDs = set() # A set gives fast duplicate checking
    # Write as UTF-8 explicitly rather than relying on the platform's default encoding
    with open(output_filepath, 'wt', encoding='utf-8') as output_TSV_file:
        for line_number,fields in get_source_lines(BBB, nn):
            try:
                B,C,V,ID, support_reference,orig_quote,occurrence,gl_quote,occurrence_note = fields
            except ValueError:
                num_errors += 1
                logging.critical(f"Expected 9 fields but found {len(fields)} in {BBB} with {fields}")
                continue # Better if we continue and process the rest of the file

            if line_number == 1: # it's the header line
                # AssertionError is raised explicitly (here and below)
                #   so the checks still happen if Python is run with -O
                if not (B=='Book' and C=='Chapter' and V=='Verse'):
                    raise AssertionError(f"Unexpected header line in {BBB}: {fields}")
                output_line = 'Reference\tID\tTags\tSupportReference\tQuote\tOccurrence\tNote'
            else:
                # Do some tidying up while we're at it
                C = C.strip(); V = V.strip(); ID = ID.strip()
                reference = f'{C}:{V}'

                if len(ID) != 4:
                    num_errors += 1
                    logging.critical(f"Expected {BBB} {C}:{V} row ID to be 4 characters (not {len(ID)} characters with '{ID}')")
                    ID = random.choice(_ROW_ID_LETTERS) \
                            + ''.join(random.choice(_ROW_ID_LETTERS_AND_DIGITS) for _ in range(3))
                if ID[0] not in _ROW_ID_LETTERS:
                    replacement_letter = _ROW_ID_DIGIT_FIXES.get(ID[0])
                    if replacement_letter is None: # then it's not a digit so we can't fix it
                        raise AssertionError(f"Bad first character in row ID '{ID}' at {BBB} {reference} line {line_number}")
                    print(f"Bad ID: {BBB} {reference} {line_number} '{ID}' fixed.")
                    ID = f"{replacement_letter}{ID[1:]}"
                for ID_char in ID[1:]:
                    if ID_char not in _ROW_ID_LETTERS_AND_DIGITS:
                        raise AssertionError(f"Bad character '{ID_char}' in row ID '{ID}' at {BBB} {reference} line {line_number}")
                if ID in seen_rowIDs:
                    logging.error(f"RowID '{ID}' is duplicated in {BBB} file")
                else:
                    seen_rowIDs.add(ID)

                tags = ''

                support_reference = support_reference.strip()
                if support_reference: support_reference = f'rc://*/ta/man/translate/{support_reference}'

                orig_quote = _tidy_orig_quote(orig_quote)
                # Must strip the occurrence field BEFORE comparing it with '0'
                occurrence = occurrence.strip()
                if not orig_quote and occurrence != '0':
                    logging.error(f"Expected occurrence=='0' for {BBB} {reference} {ID} {support_reference} '{orig_quote}' {occurrence} '{gl_quote}'")
                if occurrence == '0' and orig_quote:
                    logging.error(f"Expected no orig_quote for {BBB} {reference} {ID} {support_reference} '{orig_quote}' {occurrence} '{gl_quote}'")

                occurrence_note = _tidy_occurrence_note(occurrence_note)
                if '  ' in occurrence_note and '  *' not in occurrence_note: # used in markdown for indenting
                    print(f"NOTE: {BBB} {reference} {line_number} OccurrenceNote has unexpected double-spaces: '{occurrence_note}'")

                # Normally GL Quote is a Bible quote (and it just gets dropped)
                #   but these headings get moved into the note instead
                gl_quote = gl_quote.strip()
                if gl_quote in ('Connecting Statement:', 'General Information:', 'A Bible story from'):
                    occurrence_note = f"# {gl_quote}\\n\\n{occurrence_note}"

                output_line = f'{reference}\t{ID}\t{tags}\t{support_reference}\t{orig_quote}\t{occurrence}\t{occurrence_note}'

            output_TSV_file.write(f'{output_line}\n')
            num_lines += 1
    print(f" {num_lines:,} lines written")
    return num_lines, num_errors
# end of make_TSV_file function
def main():
    """
    Converts the TN TSV file for each book listed in BBB_NUMBER_DICT
        then reports the totals along with any failed books and major errors.

    A book whose conversion raises ValueError or AssertionError
        is reported and then skipped (so any remaining books still get processed).
    """
    print("TN_TSV9_to_TSV7.py")
    print(f" Source folderpath is {LOCAL_SOURCE_BASE_FOLDERPATH}/")
    print(f" Output folderpath is {LOCAL_OUTPUT_FOLDERPATH}/")

    lines_written_overall = 0
    major_errors_overall = 0
    books_that_failed = []
    for BBB,nn in BBB_NUMBER_DICT.items():
        try:
            book_line_count, book_error_count = make_TSV_file(BBB,nn)
            if book_error_count:
                book_suffix = '' if book_error_count==1 else 's'
                logging.critical(f"Had {book_error_count} major error{book_suffix} in {BBB} (i.e., possible lost notes)")
            lines_written_overall += book_line_count
            major_errors_overall += book_error_count
        except (ValueError, AssertionError) as exc:
            print(f"ERROR: Failed to process {BBB}: {exc}")
            books_that_failed.append(BBB)

    # Final summary
    print(f" {lines_written_overall:,} total notes written to {LOCAL_OUTPUT_FOLDERPATH}/")
    if books_that_failed:
        logging.critical(f"Had {len(books_that_failed)} failed books: {books_that_failed}")
    if major_errors_overall:
        overall_suffix = '' if major_errors_overall==1 else 's'
        logging.critical(f"Had {major_errors_overall} major error{overall_suffix} in various books (i.e., possible lost notes)")
# end of main function
# Only run the conversion when this file is executed as a script
#   (i.e., not when it's imported as a module)
if __name__ == '__main__':
    main()
# end of TN_TSV9_to_TSV7.py