2022-09-09 18:06:26 +00:00
#!/usr/bin/env python3
#
# TN_MSWrd_to_TSV9_via_Proskomma.py
#
# Copyright (c) 2021 unfoldingWord
# http://creativecommons.org/licenses/MIT/
# See LICENSE file for details.
#
# Contributors:
# Robert Hunt <Robert.Hunt@unfoldingword.org>
#
# Written Sept 2021 by RJH
# Last modified: 2021-09-21 by RJH
#
"""
Quick script to create 9-column TN files from MS-Word files.
NOTE: This requires the addition of the OrigQuote column!
"""
from typing import List , Tuple
2022-10-06 20:33:54 +00:00
import sys
2022-09-09 18:06:26 +00:00
import os
2022-12-07 03:37:20 +00:00
import csv
2022-09-09 18:06:26 +00:00
from pathlib import Path
import random
import re
import logging
import subprocess
2022-12-07 03:37:20 +00:00
from collections import OrderedDict
import urllib . request
from usfm_utils import unalign_usfm
from tx_usfm_tools . singleFilelessHtmlRenderer import SingleFilelessHtmlRenderer
from bs4 import BeautifulSoup
2022-12-15 22:24:42 +00:00
import difflib
2022-09-09 18:06:26 +00:00
2022-12-03 00:40:18 +00:00
LOCAL_SOURCE_FOLDERPATH = ' txt '
2022-09-09 18:06:26 +00:00
# The output folder below must also already exist!
2022-12-03 00:40:18 +00:00
LOCAL_OUTPUT_FOLDERPATH = ' tsv '
2022-09-09 18:06:26 +00:00
BBB_NUMBER_DICT = { ' GEN ' : ' 01 ' , ' EXO ' : ' 02 ' , ' LEV ' : ' 03 ' , ' NUM ' : ' 04 ' , ' DEU ' : ' 05 ' ,
' JOS ' : ' 06 ' , ' JDG ' : ' 07 ' , ' RUT ' : ' 08 ' , ' 1SA ' : ' 09 ' , ' 2SA ' : ' 10 ' , ' 1KI ' : ' 11 ' ,
' 2KI ' : ' 12 ' , ' 1CH ' : ' 13 ' , ' 2CH ' : ' 14 ' , ' EZR ' : ' 15 ' ,
' NEH ' : ' 16 ' ,
' EST ' : ' 17 ' ,
' JOB ' : ' 18 ' , ' PSA ' : ' 19 ' , ' PRO ' : ' 20 ' , ' ECC ' : ' 21 ' , ' SNG ' : ' 22 ' , ' ISA ' : ' 23 ' ,
' JER ' : ' 24 ' , ' LAM ' : ' 25 ' , ' EZK ' : ' 26 ' , ' DAN ' : ' 27 ' , ' HOS ' : ' 28 ' , ' JOL ' : ' 29 ' ,
' AMO ' : ' 30 ' , ' OBA ' : ' 31 ' , ' JON ' : ' 32 ' , ' MIC ' : ' 33 ' , ' NAM ' : ' 34 ' , ' HAB ' : ' 35 ' ,
' ZEP ' : ' 36 ' , ' HAG ' : ' 37 ' , ' ZEC ' : ' 38 ' , ' MAL ' : ' 39 ' ,
' MAT ' : ' 41 ' , ' MRK ' : ' 42 ' , ' LUK ' : ' 43 ' , ' JHN ' : ' 44 ' , ' ACT ' : ' 45 ' ,
' ROM ' : ' 46 ' , ' 1CO ' : ' 47 ' , ' 2CO ' : ' 48 ' , ' GAL ' : ' 49 ' , ' EPH ' : ' 50 ' , ' PHP ' : ' 51 ' ,
' COL ' : ' 52 ' , ' 1TH ' : ' 53 ' , ' 2TH ' : ' 54 ' , ' 1TI ' : ' 55 ' , ' 2TI ' : ' 56 ' , ' TIT ' : ' 57 ' ,
' PHM ' : ' 58 ' , ' HEB ' : ' 59 ' , ' JAS ' : ' 60 ' , ' 1PE ' : ' 61 ' , ' 2PE ' : ' 62 ' , ' 1JN ' : ' 63 ' ,
' 2JN ' : ' 64 ' ,
' 3JN ' : ' 65 ' , ' JUD ' : ' 66 ' , ' REV ' : ' 67 ' }
HELPER_PROGRAM_NAME = ' TN_ULT_Quotes_to_OLQuotes.js '
DEBUG_LEVEL = 1
2022-12-07 03:37:20 +00:00
book_data = OrderedDict ( )
2022-12-07 03:43:31 +00:00
errors = [ [ ' line ' , ' type ' , ' note ' ] ]
2022-09-09 18:06:26 +00:00
2022-12-07 03:37:20 +00:00
def get_book_data ( ) :
response = urllib . request . urlopen ( " https://git.door43.org/unfoldingWord/en_ult/raw/branch/master/01-GEN.usfm " )
data = response . read ( ) # a `bytes` object
book_usfm = data . decode ( ' utf-8 ' ) # a `str`; this step can't be used if data is binary
unaligned_usfm = unalign_usfm ( book_usfm )
book_html , warnings = SingleFilelessHtmlRenderer ( { " GEN " : unaligned_usfm } ) . render ( )
html_verse_splits = re . split ( r ' (<span id= " [^ " ]+-ch-0*( \ d+)-v-( \ d+(?:- \ d+)?) " class= " v-num " >) ' , book_html )
usfm_chapter_splits = re . split ( r ' \\ c ' , unaligned_usfm )
usfm_verse_splits = None
chapter_verse_index = 0
for i in range ( 1 , len ( html_verse_splits ) , 4 ) :
chapter = html_verse_splits [ i + 1 ]
verses = html_verse_splits [ i + 2 ]
if chapter not in book_data :
book_data [ chapter ] = OrderedDict ( )
usfm_chapter = f ' \\ c { usfm_chapter_splits [ int ( chapter ) ] } '
usfm_verse_splits = re . split ( r ' \\ v ' , usfm_chapter )
chapter_verse_index = 0
chapter_verse_index + = 1
verse_usfm = f ' \\ v { usfm_verse_splits [ chapter_verse_index ] } '
verse_html = html_verse_splits [ i ] + html_verse_splits [ i + 3 ]
verse_html = re . split ( ' <h2 ' , verse_html ) [ 0 ] # remove next chapter since only split on verses
verse_soup = BeautifulSoup ( verse_html , ' html.parser ' )
for tag in verse_soup . find_all ( ) :
if ( not tag . contents or len ( tag . get_text ( strip = True ) ) < = 0 ) and tag . name not in [ ' br ' , ' img ' ] :
tag . decompose ( )
verse_html = str ( verse_soup )
verses = re . findall ( r ' \ d+ ' , verses )
for verse in verses :
verse = verse . lstrip ( ' 0 ' )
book_data [ chapter ] [ verse ] = {
' usfm ' : verse_usfm ,
' html ' : verse_html
}
def add_error ( line : str , type : str , message : str ) :
errors . append ( [ line , type , message ] )
def write_errors ( ) :
2023-01-18 06:19:32 +00:00
with open ( ' tsv/errors.tsv ' , ' w ' , newline = ' \n ' ) as csvfile :
2022-12-07 03:37:20 +00:00
writer = csv . writer ( csvfile , delimiter = ' \t ' , quotechar = ' " ' , lineterminator = " \n " )
writer . writerows ( errors )
2022-12-03 00:40:18 +00:00
def get_input_fields ( input_folderpath : str , BBB : str ) - > Tuple [ str , str , str , str , str , str ] :
2022-09-09 18:06:26 +00:00
"""
Generator to read the exported MS - Word . txt files
and return the needed fields .
Returns a 4 - tuple with :
C , V , ( ULT ) verseText , ( ULT ) glQuote , note
"""
print ( f " Loading { BBB } TN links from MS-Word exported text file… " )
2022-12-03 00:40:18 +00:00
input_filepath = os . path . join ( input_folderpath , f ' { BBB } .txt ' )
2022-09-09 18:06:26 +00:00
Bbb = BBB [ 0 ] + BBB [ 1 ] . lower ( ) + BBB [ 2 ] . lower ( )
2022-10-06 20:33:54 +00:00
C = V = ' 0 '
2022-09-09 18:06:26 +00:00
verseText = glQuote = note = ' '
2022-10-06 20:33:54 +00:00
occurrence = 0
occurrences = { }
with open ( input_filepath , ' rt ' , encoding = ' utf-8 ' ) as input_text_file :
2022-12-07 21:33:13 +00:00
prevLine = ' '
2022-09-09 18:06:26 +00:00
for line_number , line in enumerate ( input_text_file , start = 1 ) :
2022-10-06 20:33:54 +00:00
if line_number == 1 and line . startswith ( ' \ufeff ' ) :
line = line [ 1 : ] # Remove optional BOM
2022-12-07 03:37:20 +00:00
line = line . rstrip ( ' \n \r ' ) . strip ( ) . replace ( " \xa0 " , " " )
2022-10-06 20:33:54 +00:00
if line . isdigit ( ) :
2022-12-07 03:37:20 +00:00
print ( " LINE IS DIGIT!!! " , line )
2022-10-06 20:33:54 +00:00
newC = line
2022-12-03 00:44:02 +00:00
if int ( line ) != int ( C ) + 1 :
2022-12-07 03:37:20 +00:00
add_error ( line_number , ' file ' , f " Chapter number is not increasing as expected: moving from { C } to { newC } " )
2022-10-06 20:33:54 +00:00
V = ' 0 '
C = newC
glQuote = note = verseText = ' '
2022-12-07 21:33:13 +00:00
prevLine = line
2022-10-06 20:33:54 +00:00
continue
if line . startswith ( f ' { Bbb } { C } : ' ) :
parts = line . split ( ' ' )
2022-12-07 03:37:20 +00:00
print ( parts )
2022-10-06 20:33:54 +00:00
newV = parts [ 1 ] . split ( ' : ' ) [ 1 ]
2022-12-07 03:37:20 +00:00
print ( line )
print ( newV , V )
2022-12-03 00:44:02 +00:00
if int ( newV ) != int ( V ) + 1 :
2022-12-07 03:37:20 +00:00
add_error ( line_number , " file " , f " Verse number is not increasing as expected: moving from { V } to { newV } " )
2022-10-06 20:33:54 +00:00
V = newV
verseText = ' ' . join ( parts [ 2 : ] )
2022-12-07 03:37:20 +00:00
print ( f " | { verseText } | " )
print ( book_data [ C ] [ V ] )
text = re . sub ( ' <[^<]+?> ' , ' ' , book_data [ C ] [ V ] [ ' html ' ] ) . strip ( )
text = re . sub ( ' ^ \ d+ * ' , ' ' , text )
2022-12-07 21:53:22 +00:00
text = re . sub ( r ' \ s+ ' , ' ' , text )
2022-12-07 03:37:20 +00:00
print ( f " ? { text } ? " )
2022-12-07 21:53:22 +00:00
verseText = re . sub ( r ' \ s+ ' , ' ' , verseText )
2022-12-07 03:37:20 +00:00
if verseText not in text :
2022-12-15 22:24:42 +00:00
diff = [ ]
prev = ' '
for i , s in enumerate ( difflib . ndiff ( verseText , text ) ) :
if s [ 0 ] != ' ' :
if prev == s [ 0 ] :
2022-12-15 22:25:35 +00:00
diff [ - 1 ] = re . sub ( ' `(.+)` ' , rf ' ` \ 1 { s [ - 1 ] } ` ' , diff [ - 1 ] )
2022-12-15 22:24:42 +00:00
elif s [ 0 ] == ' - ' :
2022-12-15 22:25:35 +00:00
diff . append ( f ' Delete ` { s [ - 1 ] } ` from position { i } ' )
2022-12-15 22:24:42 +00:00
elif s [ 0 ] == ' + ' :
2022-12-15 22:25:35 +00:00
diff . append ( f ' Add ` { s [ - 1 ] } ` to position { i } ' )
2022-12-15 22:24:42 +00:00
prev = s [ 0 ]
diffstr = " \n \n " . join ( diff )
2022-12-15 22:28:01 +00:00
add_error ( line_number , " verse " , f ' { BBB } { C } : { V } : Verse should read: \n > { text } \n \n NOT \n > { verseText } \n \n *DIFF:* \n \n { diffstr } ' )
2022-10-06 20:33:54 +00:00
occurrences = { }
glQuote = note = ' '
2022-12-07 21:33:13 +00:00
prevLine = line
2022-10-06 20:33:54 +00:00
continue
2022-12-07 21:33:13 +00:00
if not line or ' Paragraph Break ' in line or line . startswith ( f ' { C } : ' ) or prevLine . startswith ( f ' { C } : ' ) :
2022-10-06 20:33:54 +00:00
if glQuote and note :
yield C , V , verseText , glQuote , str ( occurrence ) , note
glQuote = note = ' '
occurrence = 0
2022-12-07 21:33:13 +00:00
prevLine = line
2022-10-06 20:33:54 +00:00
continue
if glQuote :
if note :
note + = " "
note + = line
2022-12-07 21:33:13 +00:00
prevLine = line
2022-10-06 20:33:54 +00:00
continue
glQuote = line
2023-01-18 06:19:32 +00:00
quote_count = len ( re . findall ( r ' (^| \ W| \ b) ' + re . escape ( glQuote ) + r ' ( \ W| \ b|$) ' , text ) )
2022-12-07 03:37:20 +00:00
if quote_count == 0 :
2022-12-07 21:58:01 +00:00
add_error ( line_number , " glQuote " , f ' { Bbb } { C } : { V } : GL Quote not found: \n ``` \n { glQuote } \n ``` \n not in \n \n > { text } ' )
2022-09-09 18:06:26 +00:00
else :
2022-10-06 20:33:54 +00:00
words = glQuote . split ( ' ' )
words_str = ' '
for word in words :
if words_str :
words_str + = ' '
words_str + = word
if words_str not in occurrences :
occurrences [ words_str ] = 1
else :
occurrences [ words_str ] + = 1
occurrence = occurrences [ glQuote ]
if quote_count < occurrence :
occurrence = quote_count
2022-12-07 21:33:13 +00:00
prevLine = line
2022-10-06 20:33:54 +00:00
continue
2022-12-07 03:37:20 +00:00
# if errors['glQuotres']:
# print("Please fix GL quotes so they match and try again.")
# sys.exit(1)
2022-10-06 20:33:54 +00:00
2022-09-09 18:06:26 +00:00
if glQuote and note :
2022-10-06 20:33:54 +00:00
yield C , V , verseText , glQuote , str ( occurrence ) , note
2022-09-09 18:06:26 +00:00
# end of get_input_fields function
OrigL_QUOTE_PLACEHOLDER = " NO OrigLQuote AVAILABLE!!! "
2022-12-03 00:40:18 +00:00
def convert_MSWrd_TN_TSV ( input_folderpath : str , output_folderpath : str , BBB : str , nn : str ) - > int :
2022-09-09 18:06:26 +00:00
"""
Function to read the exported . txt file from MS - Word and write the TN markdown file .
Returns the number of unique GLQuotes that were written in the call .
"""
testament = ' OT ' if int ( nn ) < 40 else ' NT '
2022-12-03 00:40:18 +00:00
output_filepath = os . path . join ( output_folderpath , f ' en_tn_ { nn } - { BBB } .tsv ' )
2023-02-02 01:25:38 +00:00
intros = { }
with open ( output_filepath , ' r ' , encoding = ' utf-8 ' ) as output_TSV_file :
tsv_reader = csv . DictReader ( output_TSV_file , delimiter = " \t " )
for row in tsv_reader :
if row [ " Verse " ] == " intro " :
intros [ row [ " Chapter " ] ] = " \t " . join ( row . values ( ) ) + " \n "
2022-09-09 18:06:26 +00:00
temp_output_filepath = Path ( f " { output_filepath } .tmp " )
2022-10-06 20:33:54 +00:00
with open ( temp_output_filepath , ' wt ' , encoding = ' utf-8 ' ) as temp_output_TSV_file :
2022-09-09 18:06:26 +00:00
previously_generated_ids : List [ str ] = [ ' ' ] # We make ours unique per file (spec only used to say unique per verse)
temp_output_TSV_file . write ( ' Book \t Chapter \t Verse \t ID \t SupportReference \t OrigQuote \t Occurrence \t GLQuote \t OccurrenceNote \n ' )
2023-02-02 01:25:38 +00:00
temp_output_TSV_file . write ( intros [ ' front ' ] )
prev_C = ' 0 '
2022-10-06 20:33:54 +00:00
for line_count , ( C , V , verse_text , gl_quote , occurrence , note ) in enumerate ( get_input_fields ( input_folderpath , BBB ) , start = 1 ) :
# print(f"Got {BBB} {C}:{V} '{note}' for '{gl_quote}' {occurrence} in: {verse_text}")
2023-02-02 01:25:38 +00:00
if C > prev_C and C in intros :
temp_output_TSV_file . write ( intros [ C ] )
prev_C = C
2022-09-09 18:06:26 +00:00
generated_id = ' '
while generated_id in previously_generated_ids :
generated_id = random . choice ( ' abcdefghijklmnopqrstuvwxyz ' ) + random . choice ( ' abcdefghijklmnopqrstuvwxyz0123456789 ' ) + random . choice ( ' abcdefghijklmnopqrstuvwxyz0123456789 ' ) + random . choice ( ' abcdefghijklmnopqrstuvwxyz0123456789 ' )
previously_generated_ids . append ( generated_id )
support_reference = ' '
orig_quote = OrigL_QUOTE_PLACEHOLDER
# Find "See:" TA refs and process them -- should only be one
for match in re . finditer ( r ' \ (See: ([-A-Za-z0-9]+?) \ ) ' , note ) :
2022-12-07 03:37:20 +00:00
if support_reference :
add_error ( " -1 " , " format " , f " { BBB } { C } : { V } : Should only be one TA ref: { note } " )
2022-09-09 18:06:26 +00:00
support_reference = match . group ( 1 )
note = f " { note [ : match . start ( ) ] } (See: [[rc://en/ta/man/translate/ { support_reference } ]]) { note [ match . end ( ) : ] } "
gl_quote = gl_quote . strip ( )
if ( gl_quote . startswith ( ' " ' ) ) : gl_quote = f ' “ { gl_quote [ 1 : ] } '
if ( gl_quote . endswith ( ' " ' ) ) : gl_quote = f ' { gl_quote [ : - 1 ] } ” '
if ( gl_quote . startswith ( " ' " ) ) : gl_quote = f ' ‘ { gl_quote [ 1 : ] } '
if ( gl_quote . endswith ( " ' " ) ) : gl_quote = f ' { gl_quote [ : - 1 ] } ’ '
gl_quote = gl_quote . replace ( ' " ' , ' ” ' ) . replace ( ' " ' , ' “ ' ) . replace ( " ' " , ' ’ ' ) . replace ( " ' " , ' ‘ ' ) . replace ( " ' s " , ' ’ s' )
2022-12-03 00:44:02 +00:00
if ' " ' in gl_quote or " ' " in gl_quote :
2022-12-07 03:37:20 +00:00
add_error ( line_number , " format " , f " { BBB } { C } : { V } : glQuote still has straight quote marks: ' { gl_quote } ' " )
2022-09-09 18:06:26 +00:00
note = note . strip ( )
if ( note . startswith ( ' " ' ) ) : note = f ' “ { note [ 1 : ] } '
if ( note . endswith ( ' " ' ) ) : note = f ' { note [ : - 1 ] } ” '
note = note . replace ( ' " ' , ' ” ' ) . replace ( ' " ' , ' “ ' ) \
. replace ( ' " . ' , ' ”. ' ) . replace ( ' " , ' , ' ”, ' ) \
. replace ( ' ( " ' , ' (“ ' ) . replace ( ' " ) ' , ' ”) ' ) \
. replace ( " ' " , ' ’ ' ) . replace ( " ' " , ' ‘ ' ) . replace ( " ' s " , ' ’ s' )
2022-12-03 00:44:02 +00:00
if ' " ' in note or " ' " in note :
2022-12-07 03:37:20 +00:00
add_error ( " -1 " , " format " , f " { BBB } { C } : { V } : note still has straight quote marks: ' { note } ' " )
2022-09-09 18:06:26 +00:00
temp_output_TSV_file . write ( f ' { BBB } \t { C } \t { V } \t { generated_id } \t { support_reference } \t { orig_quote } \t { occurrence } \t { gl_quote } \t { note } \n ' )
# Now use Proskomma to find the ULT GLQuote fields for the OrigQuotes in the temporary outputted file
print ( f " Running Proskomma to find OrigL quotes for { testament } { BBB } … (might take a few minutes) " )
2023-02-02 01:25:38 +00:00
2022-09-09 18:06:26 +00:00
completed_process_result = subprocess . run ( [ ' node ' , HELPER_PROGRAM_NAME , temp_output_filepath , testament ] , capture_output = True )
2022-12-03 00:40:18 +00:00
# print(f"Proskomma {BBB} result was: {completed_process_result}")
2022-09-09 18:06:26 +00:00
if completed_process_result . returncode :
print ( f " Proskomma { BBB } ERROR result was: { completed_process_result . returncode } " )
if completed_process_result . stderr :
print ( f " Proskomma { BBB } error output was: \n { completed_process_result . stderr . decode ( ) } " )
proskomma_output_string = completed_process_result . stdout . decode ( )
2022-12-03 00:45:56 +00:00
# print(f"Proskomma {BBB} output was: {proskomma_output_string}") # For debugging JS helper program only
2022-09-09 18:06:26 +00:00
output_lines = proskomma_output_string . split ( ' \n ' )
if output_lines :
# Log any errors that occurred -- not really needed now coz they go to stderr
print_next_line_counter = 0
for output_line in output_lines :
if ' Error: ' in output_line :
logging . error ( output_line )
print_next_line_counter = 2 # Log this many following lines as well
elif print_next_line_counter > 0 :
logging . error ( output_line )
print_next_line_counter - = 1
print ( f " Proskomma got: { ' / ' . join ( output_lines [ : 9 ] ) } " ) # Displays the UHB/UGNT and ULT loading times
print ( f " Proskomma did: { output_lines [ - 2 ] } " )
else : logging . critical ( " No output from Proskomma!!! " )
# Put the GL Quotes into a dict for easy access
match_dict = { }
for match in re . finditer ( r ' ( \ w {3} )_( \ d { 1,3}):( \ d { 1,3}) ►(.+?)◄ “(.+?)” ' , proskomma_output_string ) :
B , C , V , gl_quote , orig_quote = match . groups ( )
assert B == BBB , f " { B } { C } : { V } ' { orig_quote } ' Should be equal ' { B } ' ' { BBB } ' "
if orig_quote :
match_dict [ ( C , V , gl_quote ) ] = orig_quote
else :
logging . error ( f " { B } { C } : { V } ' { gl_quote } ' Should have gotten an OrigLQuote " )
print ( f " Got { len ( match_dict ) : , } unique OrigL Quotes back from Proskomma for { BBB } " )
match_count = fail_count = 0
if match_dict : # (if not, the ULT book probably isn't aligned yet)
# Now put the OrigL Quotes into the file
2022-10-06 20:33:54 +00:00
with open ( temp_output_filepath , ' rt ' , encoding = " utf-8 " ) as temp_input_text_file :
with open ( output_filepath , ' wt ' , encoding = ' utf-8 ' ) as output_TSV_file :
2022-09-09 18:06:26 +00:00
output_TSV_file . write ( temp_input_text_file . readline ( ) ) # Write the TSV header
for line in temp_input_text_file :
2023-02-02 01:25:38 +00:00
print ( line )
2022-09-09 18:06:26 +00:00
B , C , V , rowID , support_reference , orig_quote , occurrence , gl_quote , occurrence_note = line . split ( ' \t ' )
try :
if gl_quote :
orig_quote = match_dict [ ( C , V , gl_quote ) ]
match_count + = 1
except KeyError :
logging . error ( f " Unable to find OrigLQuote for { BBB } { C } : { V } { rowID } ' { gl_quote } ' " )
fail_count + = 1
# orig_quote = orig_quote.replace('…',' … ').replace(' ',' ') # Put space around ellipsis in field intended for human readers
output_TSV_file . write ( f ' { B } \t { C } \t { V } \t { rowID } \t { support_reference } \t { orig_quote } \t { occurrence } \t { gl_quote } \t { occurrence_note } ' )
os . remove ( temp_output_filepath )
return line_count , match_count , fail_count
# end of convert_TN_TSV
def main ( ) :
"""
Go through the list of Bible books
and convert them
while keeping track of some basic statistics
"""
print ( " TN_MSWrd_to_TSV9_via_Proskomma.py " )
print ( f " Source folderpath is { LOCAL_SOURCE_FOLDERPATH } / " )
print ( f " Output folderpath is { LOCAL_OUTPUT_FOLDERPATH } / " )
total_files_read = total_files_written = 0
total_lines_read = total_quotes_written = 0
total_GLQuote_failures = 0
failed_book_list = [ ]
2022-12-07 03:37:20 +00:00
for BBB , nn in BBB_NUMBER_DICT . items ( ) :
2022-09-09 18:06:26 +00:00
if BBB != ' GEN ' : continue # Just process this one book
# if BBB not in ('MAT','MRK','LUK','JHN', 'ACT',
# 'ROM','1CO','2CO','GAL','EPH','PHP','COL',
# '1TH','2TH','1TI','2TI','TIT','PHM',
# 'HEB','JAS','1PE','2PE','1JN','2JN','3JN','JUD','REV'):
# continue # Just process NT books
2022-12-07 03:37:20 +00:00
lines_read , this_note_count , fail_count = convert_MSWrd_TN_TSV ( LOCAL_SOURCE_FOLDERPATH , LOCAL_OUTPUT_FOLDERPATH , BBB , nn )
2022-09-09 18:06:26 +00:00
total_lines_read + = lines_read
total_files_read + = 1
if this_note_count :
total_quotes_written + = this_note_count
total_files_written + = 1
total_GLQuote_failures + = fail_count
print ( f " { total_lines_read : , } lines read from { total_files_read } TSV file { ' ' if total_files_read == 1 else ' s ' } " )
print ( f " { total_quotes_written : , } GL quotes written to { total_files_written } TSV file { ' ' if total_files_written == 1 else ' s ' } in { LOCAL_OUTPUT_FOLDERPATH } / " )
if total_GLQuote_failures :
print ( f " Had a total of { total_GLQuote_failures : , } GLQuote failure { ' ' if total_GLQuote_failures == 1 else ' s ' } ! " )
if failed_book_list :
logging . critical ( f " { len ( failed_book_list ) } books failed completely: { failed_book_list } " )
# end of main function
if __name__ == ' __main__ ' :
2022-12-07 03:37:20 +00:00
get_book_data ( )
2022-09-09 18:06:26 +00:00
main ( )
2022-12-07 03:37:20 +00:00
write_errors ( )
2022-09-09 18:06:26 +00:00
# end of TN_MSWrd_to_TSV9_via_Proskomma.py