richmahn_GEN_MSWord_notes/TN_TSV9_to_TSV7.py

#!/usr/bin/env python3
#
# TN_TSV9_to_TSV7.py
#
# Copyright (c) 2020-2021 unfoldingWord
# http://creativecommons.org/licenses/MIT/
# See LICENSE file for details.
#
# Contributors:
#   Robert Hunt <Robert.Hunt@unfoldingword.org>
#
# Written Aug 2020 by RJH
#   Last modified: 2021-06-28 by RJH
#
"""
Quick script to copy TN from 9-column TSV files
    and put into a TSV file with the new 7-column format.
"""
import logging
import os
import random
import re
from pathlib import Path
from typing import Iterator, List, Tuple


# All folderpaths below are relative to the current working directory
LOCAL_SOURCE_BASE_FOLDERPATH = Path('./')

# The 9-column source files are read from the tsv/ subfolder
LOCAL_SOURCE_FOLDERPATH = LOCAL_SOURCE_BASE_FOLDERPATH / 'tsv/'

# The 7-column files are written into that same tsv/ subfolder
#   (make_TSV_file creates this one folder level if it is missing)
LOCAL_OUTPUT_FOLDERPATH = LOCAL_SOURCE_BASE_FOLDERPATH / 'tsv/'

# Books to convert: 3-character book code -> 2-digit number used in the source filename
BBB_NUMBER_DICT = dict(GEN='01')


def get_source_lines(BBB:str, nn:str) -> Iterator[Tuple[int, List[str]]]:
    """
    Generator to read one 9-column TN TSV file
        and yield each line split into its tab-separated fields.

    The file read is en_tn_{nn}-{BBB}.tsv inside LOCAL_SOURCE_FOLDERPATH.

    Yields a 2-tuple with:
        line number (counting from 1)
        list of the field strings from that line

    Note that lines are deliberately NOT stripped here:
        the final field of each line still carries the trailing
        newline character (if the line had one), and a blank line
        yields a list with a single field -- the caller is expected
        to strip fields and to check the number of fields.
    """
    source_filename = f'en_tn_{nn}-{BBB}.tsv'
    source_filepath = LOCAL_SOURCE_FOLDERPATH.joinpath(source_filename)
    print(f"      Getting source lines from {source_filepath}")

    # Give the encoding explicitly -- otherwise the platform default is used,
    #   which might not be UTF-8 and the files can contain non-ASCII characters
    with open(source_filepath, 'rt', encoding='utf-8') as source_tsv_file:
        for line_number,line in enumerate(source_tsv_file, start=1):
            yield line_number, line.split('\t')
# end of get_source_lines function


def _tidy_orig_quote(orig_quote:str) -> str:
    """
    Returns the original-language quote field after cleaning:
        non-break spaces become normal spaces,
        zero-width spaces are deleted,
        three dots become an ellipsis character,
        leading/trailing ellipses are removed,
        and each remaining ellipsis (with any adjacent space) becomes ' & '.
    """
    orig_quote = orig_quote.replace('\u00A0', ' ') # Replace non-break spaces
    orig_quote = orig_quote.replace('\u200B', '') # Delete zero-width spaces
    orig_quote = orig_quote.replace('...', '…')
    orig_quote = orig_quote.replace(' …', '…').replace('… ', '…')
    orig_quote = orig_quote.strip('…') # Should only be BETWEEN words
    orig_quote = orig_quote.replace('…', ' & ')
    return orig_quote.strip()
# end of _tidy_orig_quote function


def _tidy_occurrence_note(occurrence_note:str) -> str:
    """
    Returns the note field after cleaning:
        surrounding whitespace and leading/trailing <br> are removed,
        remaining <br> (either case) become an escaped newline,
        rc://en/ links become rc://*/ links,
        and spacing around ellipses, after asterisks,
        and doubled spaces in general are normalised.
    """
    occurrence_note = occurrence_note.strip()
    occurrence_note = occurrence_note.replace('<BR>', '<br>')
    if occurrence_note.startswith('<br>'): occurrence_note = occurrence_note[4:]
    if occurrence_note.endswith('<br>'): occurrence_note = occurrence_note[:-4]
    # NOTE: '\\n' here is the two characters backslash and n (not a real newline)
    occurrence_note = occurrence_note.replace('<br>', '\\n')
    occurrence_note = occurrence_note.replace('rc://en/', 'rc://*/')
    occurrence_note = occurrence_note.replace('…', ' … ').replace('  …', ' …').replace('…  ', '… ')
    while '*  ' in occurrence_note: occurrence_note = occurrence_note.replace('*  ', '* ')
    # Temporarily hide 2- and 3-space indents after an escaped newline
    #   so that they survive the double-space reduction on the next line
    occurrence_note = occurrence_note.replace('\\n   ', '\\n@@@').replace('\\n  ', '\\n@@')
    occurrence_note = occurrence_note.replace('  ', ' ') # Might mess up markdown indents ???
    occurrence_note = occurrence_note.replace('\\n@@@', '\\n   ').replace('\\n@@', '\\n  ')
    return occurrence_note.strip()
# end of _tidy_occurrence_note function


def make_TSV_file(BBB:str, nn:str) -> Tuple[int,int]:
    """
    Converts the 9-column TN TSV file for one book
        (as yielded by get_source_lines)
        into tn_{BBB}.tsv in LOCAL_OUTPUT_FOLDERPATH.

    Combines chapter and verse number into reference

    Does a little checking and cleaning of other fields

    Drops the GL Quote field
        (except that the three special headings 'Connecting Statement:',
        'General Information:' and 'A Bible story from'
        are moved to the start of the note as a markdown heading)

    Writes the 7-column TSV file

    Lines which don't have exactly 9 fields are logged,
        counted as errors, and not written.

    Returns a 2-tuple with:
        the number of lines written (including the heading line)
        the number of major errors
            (lines with the wrong number of fields,
            or row IDs that weren't four characters long)

    Raises AssertionError if the heading line or a row ID fails validation.
    """
    print(f"    Converting TN {BBB} links to TSV…")
    output_folderpath = LOCAL_OUTPUT_FOLDERPATH #.joinpath(BBB)
    if not os.path.isdir(output_folderpath): os.mkdir(output_folderpath)
    output_filepath = output_folderpath.joinpath(f'tn_{BBB}.tsv')

    ID_first_chars = 'abcdefghijklmnopqrstuvwxyz'
    ID_other_chars = 'abcdefghijklmnopqrstuvwxyz0123456789'
    # Letters to substitute for a leading digit in a row ID
    convert_dict = {'1':'a', '2':'t', '3':'c', '4':'d', '5':'f', '6':'g', '7':'s', '8':'h', '9':'n', '0':'z' } # We don't use i l o (more easily confused)

    num_lines = num_errors = 0
    rowID_set = set() # A set gives a fast duplicate check
    # Give the encoding explicitly because the output contains non-ASCII characters
    with open(output_filepath, 'wt', encoding='utf-8') as output_TSV_file:
        for line_number,fields in get_source_lines(BBB, nn):
            try:
                B,C,V,ID, support_reference,orig_quote,occurrence,gl_quote,occurrence_note = fields
            except ValueError:
                num_errors += 1
                logging.critical(f"Expected 9 fields but found {len(fields)} in {BBB} with {fields}")
                continue # Better to process the rest of the file than to abort
            if line_number == 1: # Heading line
                assert B=='Book' and C=='Chapter' and V=='Verse' # etc.
                output_line = 'Reference\tID\tTags\tSupportReference\tQuote\tOccurrence\tNote'
            else:
                # Do some tidying up while we're at it
                C = C.strip(); V = V.strip(); ID = ID.strip()
                reference = f'{C}:{V}'

                if len(ID) != 4:
                    num_errors += 1
                    logging.critical(f"Expected {BBB} {C}:{V} row ID to be 4 characters (not {len(ID)} characters with '{ID}')")
                    # Replace the bad ID with a random one in the correct format
                    ID = random.choice(ID_first_chars) + ''.join(random.choice(ID_other_chars) for _ in range(3))
                # A leading digit gets swapped for a letter.
                # Any other bad leading character is left for the check below
                #   (so it gives an AssertionError rather than a KeyError from the lookup)
                replacement_char = convert_dict.get(ID[0])
                if replacement_char is not None:
                    print(f"Bad ID: {BBB} {reference} {line_number} '{ID}' fixed.")
                    ID = f"{replacement_char}{ID[1:]}"
                assert ID[0] in ID_first_chars and all(char in ID_other_chars for char in ID[1:]), \
                    f"Invalid row ID '{ID}' at {BBB} {reference} (line {line_number})"
                if ID in rowID_set:
                    logging.error(f"RowID '{ID}' is duplicated in {BBB} file")
                else:
                    rowID_set.add(ID)

                tags = ''

                support_reference = support_reference.strip()
                if support_reference: support_reference = f'rc://*/ta/man/translate/{support_reference}'

                orig_quote = _tidy_orig_quote(orig_quote)

                # Strip BEFORE the checks below so that surrounding whitespace
                #   doesn't cause a false mismatch with '0'
                occurrence = occurrence.strip()
                if not orig_quote and occurrence != '0':
                    logging.error(f"Expected occurrence=='0' for {BBB} {reference} {ID} {support_reference} '{orig_quote}' {occurrence} '{gl_quote}'")
                if occurrence == '0' and orig_quote:
                    logging.error(f"Expected no orig_quote for {BBB} {reference} {ID} {support_reference} '{orig_quote}' {occurrence} '{gl_quote}'")

                occurrence_note = _tidy_occurrence_note(occurrence_note)
                if '  ' in occurrence_note and '  *' not in occurrence_note: # used in markdown for indenting
                    print(f"NOTE: {BBB} {reference} {line_number} OccurrenceNote has unexpected double-spaces: '{occurrence_note}'")

                # Normally GL Quote is a Bible quote (and is dropped)
                #   but these special headings get kept as a heading in the note
                gl_quote = gl_quote.strip()
                if gl_quote in ('Connecting Statement:', 'General Information:', 'A Bible story from'):
                    occurrence_note = f"# {gl_quote}\\n\\n{occurrence_note}"
                    gl_quote = ''

                output_line = f'{reference}\t{ID}\t{tags}\t{support_reference}\t{orig_quote}\t{occurrence}\t{occurrence_note}'
            output_TSV_file.write(f'{output_line}\n')
            num_lines += 1
    print(f"      {num_lines:,} lines written")
    return num_lines, num_errors
# end of make_TSV_file function


def main():
    """
    Script entry point.

    Prints the source and output folderpaths,
        converts each book listed in BBB_NUMBER_DICT with make_TSV_file,
        then reports the total number of lines written
        along with any failed books and the major error count.

    A ValueError or AssertionError raised while converting a book
        is printed and that book is recorded as failed --
        any remaining books are still processed.
    """
    print("TN_TSV9_to_TSV7.py")
    print(f"  Source folderpath is {LOCAL_SOURCE_BASE_FOLDERPATH}/")
    print(f"  Output folderpath is {LOCAL_OUTPUT_FOLDERPATH}/")

    failed_books = []
    total_questions = 0
    num_major_errors = 0
    for BBB in BBB_NUMBER_DICT:
        book_number = BBB_NUMBER_DICT[BBB]
        try:
            lines_written, major_error_count = make_TSV_file(BBB, book_number)
        except (ValueError, AssertionError) as err:
            print(f"ERROR: Failed to process {BBB}: {err}")
            failed_books.append(BBB)
            continue # Go on to the next book

        if major_error_count:
            logging.critical(f"Had {major_error_count} major error{'' if major_error_count==1 else 's'} in {BBB} (i.e., possible lost notes)")
        total_questions += lines_written
        num_major_errors += major_error_count

    # Final summary over all the books
    print(f"    {total_questions:,} total notes written to {LOCAL_OUTPUT_FOLDERPATH}/")
    if failed_books:
        logging.critical(f"Had {len(failed_books)} failed books: {failed_books}")
    if num_major_errors:
        logging.critical(f"Had {num_major_errors} major error{'' if num_major_errors==1 else 's'} in various books (i.e., possible lost notes)")
# end of main function

# Only run the conversion when this file is executed as a script
#   (so importing it as a module has no side effects beyond the definitions above)
if __name__ == '__main__':
    main()
# end of TN_TSV9_to_TSV7.py