en_btr_backend/app/Handlers/UlbXmlImportHandler.php

515 lines
16 KiB
PHP

<?php
namespace App\Handlers;
use App\Book;
use App\Chapter;
use App\Helpers\Traits\BookTitleHelperTrait;
use App\Word;
use App\Verse;
use DOMDocument;
use DOMElement;
use DOMNodeList;
use Illuminate\Console\OutputStyle;
use Symfony\Component\Console\Output\OutputInterface;
/**
* UlbXmlImportHandler.php
*
* @author: Leonard Smith <leonard@acornwebconsultants.com>
* Date: 10/2/20
* Time: 8:04 AM
*/
class UlbXmlImportHandler
{
use BookXmlFilesTrait, BookTitleHelperTrait;
const REPO_FOLDER = '/ulb/';
/**
* @var DOMDocument
*/
protected $document;
/**
* @var int
*/
protected $currentGNTSort = 0;
/**
* @var int
*/
protected $currentULBSort = 0;
/**
* The following four integer variables provide state tracking
* for building ULB and GNT sort data. Instead of relying solely on
* OGNTSort, which is pulled from the OpenGNT data sources, we need
* to create our own sort indices so that we can re-sort the data
* on the fly.
*
* @var int
*/
protected $currentBookNumber = 0;
/**
* @var int
*/
protected $currentChapterNumber = 0;
/**
* @var int
*/
protected $currentVerseNumber = 0;
/**
* @var int
*/
protected $currentWordNumber = 0;
/**
* @var OutputStyle
*/
protected $output;
/**
 * Book XML files found on disk, keyed by book name. Populated by run().
 *
 * @var array
 */
protected $availableFiles = [];
/**
 * Substitution lookup keyed by the value of a <w> element's "sub" attribute.
 * Written by buildVerseSubs() and importWord(), read by replaceSub().
 *
 * Declared explicitly (instead of being created on first assignment) so that
 * it is always an array when replaceSub() reads it, and so the class does not
 * rely on dynamic properties.
 *
 * @var array
 */
protected $subs = [];
/**
 * @param OutputStyle|null $output Console output used for progress messages.
 *                                 When null, write() and writeln() emit nothing.
 */
public function __construct(?OutputStyle $output = null)
{
    // The nullable type is spelled out explicitly; relying on "= null" to make
    // the parameter nullable is deprecated in newer PHP versions.
    $this->output = $output;
}
/**
 * Entry point: record every known book whose XML file exists on disk,
 * then import each of them.
 */
public function run() : void
{
    foreach (self::$bookXmlFiles as $bookName => $unusedFilename) {
        $candidatePath = $this->getBookXmlFilePath(self::REPO_FOLDER, $bookName);

        // Books without a file on disk are silently skipped.
        if (!file_exists($candidatePath)) {
            continue;
        }

        $this->availableFiles[$bookName] = $candidatePath;
    }

    $this->loopOverXmlFiles();
}
/**
 * Import every file collected in $availableFiles, then report completion.
 */
public function loopOverXmlFiles() : void
{
    foreach ($this->availableFiles as $bookName => $bookPath) {
        $this->importBook($bookName, $bookPath);
    }

    $this->writeln("DONE");
}
/**
 * Import a single book: create the Book record, then import and attach
 * every <chapter> element found in its XML document.
 *
 * @param string $bookTitle Book name, passed to prepareBookId() / prepareBookName()
 * @param string $filepath  Path of the book's XML file
 * @return Book
 */
public function importBook(string $bookTitle, string $filepath) : Book
{
    // The book number feeds getUlbSortNumber() for every word in this book.
    $this->currentBookNumber = $this->parseBookNumberFromFilepath($filepath);
    $this->write("Importing $bookTitle from $filepath Chapters: ", false);

    // Kept on the instance because buildVersePhrases() creates elements from it.
    $this->document = $this->openBook($filepath);

    $bookAttributes = [
        'id' => $this->prepareBookId($bookTitle),
        'name' => $this->prepareBookName($bookTitle),
    ];
    $book = Book::create($bookAttributes);

    foreach ($this->document->getElementsByTagName('chapter') as $chapterNode) {
        $importedChapter = $this->importChapter($chapterNode, $book->id);
        $book->chapters()->save($importedChapter);
    }

    $this->write("Done.", true);

    return $book;
}
/**
 * Import one chapter: create the Chapter record, then import and attach
 * every <verse> element it contains.
 *
 * @param DOMElement $chapterElem The <chapter> element
 * @param string     $book_id     Id of the owning book; prefixes the chapter id
 * @return Chapter
 */
public function importChapter(DOMElement $chapterElem, string $book_id) : Chapter
{
    $name = $this->parseChapterName($chapterElem);

    // Remembered for getUlbSortNumber().
    $this->currentChapterNumber = $name;
    $this->write($name . ".", false);

    $chapter = Chapter::create([
        'id' => implode('-', [$book_id, $name]),
        'name' => $name,
    ]);

    foreach ($chapterElem->getElementsByTagName('verse') as $verseNode) {
        $importedVerse = $this->importVerse($verseNode, $chapter->id);
        $chapter->verses()->save($importedVerse);
    }

    return $chapter;
}
// *** Semi-pseudo code
// Build all the subs for a verse
// Load all the subs for a verse XML element
/**
 * Contributed by Tim Maggio and adapted by Leonard Smith
 *
 * Rebuilds $this->subs for one verse: every <w> element carrying a "sub"
 * attribute is recorded as [sub attribute value => text of that <w> element].
 *
 * @param $verseElement The verse element whose <w sub="..."> children are collected
 *
 * @NOTE: May not need this method as it makes more sense to handle the subs
 * closer to the importWord logic. That way, we can save a copy of the Word object and
 * have access to the saved id that can then replace the sub number in the ULB text. That
 * gives us a substitution pattern that refers to the actual words table's id rather than
 * a number that is only relative to the respective verse. Theoretically, this makes our
 * data more robust and less prone to errors down the road.
 *
 * NOTE(review): replaceSub() reads ->id from each $this->subs entry, which matches the
 * Word objects stored by importWord(), not the plain strings stored here.
 */
private function buildVerseSubs( $verseElement )
{
    // xpath is not used here; iterating over the <w> elements ourselves is the
    // most direct route.
    $subElements = $this->getElementsByAttribute($verseElement->getElementsByTagName("w"), "sub");

    // Start from an empty lookup for every verse.
    $this->subs = [];

    foreach ($subElements as $subElement) {
        // The "number" for this sub; 1, 2, 3... digit subs work fine.
        $subNumber = $subElement->getAttribute("sub");

        // The replacement is the value of this <w> element itself,
        // not the text of the whole verse.
        $this->subs[$subNumber] = $subElement->nodeValue;
    }
}
// *** Semi-pseudo code
// Build all the phrases for a verse
// Load all the phrases for a verse XML element
// Adds a pseudo-word for the phraseWords
// Adds a phraseid for all words
// Absence of a phraseid for a word means it's just a word
/**
 * Contributed by Tim Maggio and adapted by Leonard Smith
 *
 * Flattens every <phrase> element of a verse so that importVerse() can treat
 * phrases like ordinary words. For each <phrase>:
 *  - a new pseudo-word <w> holding the text of the first <phraseWords> element is
 *    inserted into the verse, tagged with phraseId and strongs="-1";
 *  - every <w> inside the phrase is tagged with the same phraseId and inserted
 *    into the verse directly before that pseudo-word.
 *
 * The verse element is modified in place; nothing is returned.
 *
 * @param DOMElement $verseElement
 */
private function buildVersePhrases( DOMElement $verseElement )
{
$phraselist = $verseElement->getElementsByTagName( "phrase" );
// Phrase ID is a simple numeric increment that restarts at 1 for each verse.
// No need to be unique across verses.
$phraseid = 1;
// Go through all the XML phrase elements in a verse
foreach ( $phraselist as $phraseEl )
{
// Remember the node that follows the phrase; the pseudo-word is placed before it
$nextSibling = $phraseEl->nextSibling;
// Get the ULB phrase words (only the first <phraseWords> element is used)
// NOTE(review): [0] is not checked; a <phrase> without <phraseWords> would fail on the nodeValue read below.
/** @var DOMElement $phrasewords */
$phrasewords = $phraseEl->getElementsByTagName( "phraseWords" )[0];
// * * * * * * * * * * * * * * *
// Add a pseudo-word element into the verse containing the ULB phrase words.
// NOTE(review): the PHP manual says createElement() does not escape "&" in the value - confirm the phrase text never contains one.
$newEl = $this->document->createElement('w', $phrasewords->nodeValue);
$newEl->setAttribute('phraseId', $phraseid);
// strongs = -1 marks the pseudo-word; formatStrongsNumber() passes -1 through unchanged
$newEl->setAttribute('strongs', -1);
if ($nextSibling === null) {
// Nothing follows the phrase, so the pseudo-word is added at the end of the verse
$verseElement->appendChild($newEl);
} else {
// NOTE(review): assumes the <phrase> is a direct child of the verse element - confirm
$verseElement->insertBefore($newEl, $nextSibling);
}
// Get an object of all words in this phrase
/** @var DOMNodeList $words */
$words = $phraseEl->getElementsByTagName( "w" );
// Go through each word in the phrase. The list is copied into a plain array first,
// presumably so that relocating nodes below does not disturb the iteration.
/**
* @var DOMElement $wordElem
*/
foreach ( iterator_to_array($words) as $wordElem )
{
// Add the Phrase ID to the XML DOM for this word
$wordElem->setAttribute('phraseId', $phraseid);
// *** This is the tricky part: the same <w> node is inserted directly under the verse,
// just before the pseudo-word, and keeps its Phrase ID. Per the DOM specification,
// inserting an already-attached node moves it rather than copying it.
$verseElement->insertBefore( $wordElem, $newEl );
}
$phraseid++;
}
}
/**
 * Import one verse: flatten its phrases, create the Verse record, then import
 * and attach every <w> element found in the verse.
 *
 * @param DOMElement $verseElem  The <verse> element
 * @param string     $chapter_id Id of the owning chapter; prefixes the verse id
 * @return Verse
 */
public function importVerse(DOMElement $verseElem, string $chapter_id) : Verse
{
    $number = $this->parseVerseNumber($verseElem);

    // Remembered for getUlbSortNumber().
    $this->currentVerseNumber = (int) $number;

    // Must run before the words are collected: it adds the phrase pseudo-words
    // and promotes the words of each phrase up into the verse.
    $this->buildVersePhrases($verseElem);

    $verseAttributes = [
        'id' => implode('-', [$chapter_id, $number]),
        'name' => $number,
        'greek_text' => $verseElem->getElementsByTagName('Greek')[0]->nodeValue,
        'ulb_text' => $verseElem->getElementsByTagName('ULB')[0]->nodeValue,
    ];
    $verse = Verse::create($verseAttributes);

    // Word numbering restarts with every verse.
    $this->currentWordNumber = 0;

    foreach ($verseElem->getElementsByTagName('w') as $wordNode) {
        $importedWord = $this->importWord($wordNode, $verse->id);
        $verse->words()->save($importedWord);
    }

    return $verse;
}
// *** Semi-pseudo code
// Replace the sub word in a ULB text translation
// The guts of this method were tested in a stand-alone PHP
// * * * * * *
// This might be language based. A pluggable substitution method
// might be needed to get this job done right.
// Logic for this substitution arguably belongs on the back end
// The original and substituted text could be saved in the database
// * * * * * *
// Every word gets a function call and a REGEX match test
// If this gets expensive, could do the preg_match in the calling function
// I did it this way so it's cleaner and logic is self-contained
/**
 * Replace every "[n]" placeholder in a word element's text.
 *
 * A placeholder whose full token (brackets included) is a key of $this->subs is
 * replaced with "[<id>]", where <id> is the id property of the stored entry.
 * A placeholder without an entry is replaced with "([n] ??)" so the problem is
 * visible in the imported text.
 *
 * All placeholders are substituted in one pass, so text that has already been
 * substituted is never scanned again. This matters when a stored id happens to
 * equal another placeholder number, and when an unmatched placeholder occurs
 * more than once.
 *
 * @param $wordElem Element whose nodeValue is scanned (it's not always a single word)
 * @return string|null The text with placeholders substituted
 */
private function replaceSub( $wordElem )
{
    $text = $wordElem->nodeValue;
    if (!is_string($text) || $text === '') {
        return $text;
    }

    // $this->subs may not have been populated yet; treat that as "no subs known".
    $subs = is_array($this->subs ?? null) ? $this->subs : [];

    $replaced = preg_replace_callback(
        "#\[\d+\]#",
        static function (array $match) use ($subs) {
            $key = $match[0];

            if (array_key_exists($key, $subs)) {
                // Grab the word object. Not actually necessary but aids readability
                /** @var Word $word */
                $word = $subs[$key];
                return "[$word->id]";
            }

            // Strange.. the <w> node value has a [X] in it, but no sub match
            return "({$key} ??)";
        },
        $text
    );

    // preg_replace_callback() yields null on a PCRE error; keep the original text then.
    return $replaced ?? $text;
}
/**
 * Create the Word record for one <w> element.
 *
 * The element's usfm markup is stripped and its "[n]" placeholders are resolved
 * before the text is stored. A word carrying a "sub" attribute is remembered in
 * $this->subs afterwards so later placeholders can refer to its saved record.
 *
 * @param DOMElement $wordElem The <w> element
 * @param string     $verse_id Id of the owning verse
 * @return Word
 */
public function importWord(DOMElement $wordElem, string $verse_id) : Word
{
    $ogntSort = $wordElem->getAttribute('OGNTsort');

    // Are we dealing with an element with a sub attribute?
    $subMarker = $wordElem->getAttribute('sub');
    $hasSub = !empty($subMarker);

    // Does our content require a substitution?
    $cleanedElem = $this->stripUsfmMarkupFromUlb($wordElem);
    $ulbText = $this->replaceSub($cleanedElem);

    // Only present on elements touched by buildVersePhrases().
    $phraseId = null;
    if ($wordElem->hasAttribute('phraseId')) {
        $phraseId = $wordElem->getAttribute('phraseId');
    }

    $word = Word::create([
        'verse_code' => implode('-', [$verse_id, $ogntSort]),
        'verse_id' => $verse_id,
        'ulb' => $ulbText,
        'phrase_id' => $phraseId,
        'sub' => $hasSub,
        'greek' => $wordElem->getAttribute('text'),
        'lemma' => $wordElem->getAttribute('lemma'),
        'morph' => $wordElem->getAttribute('morph'),
        'ognt_sort' => empty($ogntSort) ? 0 : $ogntSort,
        'strongs_number' => $this->formatStrongsNumber($wordElem->getAttribute('strongs')),
        'ulb_sort' => $this->getUlbSortNumber(),
    ]);

    // Save this word object to the subs stack so that we can grab it when we need it.
    if ($hasSub) {
        $this->subs[$subMarker] = $word;
    }

    return $word;
}
/**
 * Return, in list order, the elements of a node list that carry the given attribute.
 *
 * @param DOMNodeList $elements  Candidate elements
 * @param string      $attribute Attribute name that must be present
 * @return array Zero-indexed list of the matching elements
 */
private function getElementsByAttribute(DOMNodeList $elements, string $attribute) : array
{
    $matching = array_filter(
        iterator_to_array($elements, false),
        static function ($candidate) use ($attribute) {
            return $candidate->hasAttribute($attribute);
        }
    );

    // array_filter() keeps the original keys; callers get a plain list.
    return array_values($matching);
}
/**
 * Remove every <usfm> descendant from the given element, in place.
 *
 * @param DOMElement $element Element to clean
 * @return DOMElement The same element, for chaining
 */
protected function stripUsfmMarkupFromUlb(DOMElement $element) : DOMElement
{
    // getElementsByTagName() returns a live list: removing nodes while iterating
    // over it skips siblings, so work on a snapshot instead.
    $usfmNodes = iterator_to_array($element->getElementsByTagName('usfm'), false);

    foreach ($usfmNodes as $usfmNode) {
        // Detach from the node's own parent: the <usfm> element is not
        // necessarily a direct child of $element.
        $parent = $usfmNode->parentNode;
        if ($parent !== null) {
            $parent->removeChild($usfmNode);
        }
    }

    return $element;
}
/**
 * Build the sort index for the next word and advance the word counter.
 *
 * The index is the book number followed by the zero-padded chapter (2 digits),
 * verse (3 digits) and word (3 digits) counters.
 *
 * @return int
 */
protected function getUlbSortNumber() : int
{
    // Use the current word number for this index, then move on to the next one.
    $wordIndex = $this->currentWordNumber++;

    // The digits are assembled as a string; the declared int return type converts it.
    return sprintf(
        "%s%02d%03d%03d",
        $this->currentBookNumber,
        $this->currentChapterNumber,
        $this->currentVerseNumber,
        $wordIndex
    );
}
/**
 * Not all of the Strong's numbers coming from the XML files are formatted the same,
 * so they are normalised here to a capital "G" followed by the rest of the value.
 *
 * @param string $strongsNumber Raw "strongs" attribute value
 * @return string|null The value unchanged when it equals -1 (the phrase pseudo-word
 *                     marker), null when it is empty, the "G"-prefixed form otherwise
 */
protected function formatStrongsNumber(string $strongsNumber) : ?string
{
    // Loose comparison on purpose: the attribute arrives as a string.
    if ($strongsNumber == -1) {
        return $strongsNumber;
    }

    if (empty($strongsNumber)) {
        return null;
    }

    // Drop any existing upper- or lower-case prefix before adding the canonical one.
    $withoutPrefix = ltrim($strongsNumber, "Gg");

    return 'G' . $withoutPrefix;
}
/**
 * Extract the chapter number from a chapter element's "osisID" attribute:
 * the run of digits that ends the value.
 *
 * @param DOMElement $chapterElem The <chapter> element
 * @return string The chapter number as it appears in the attribute
 * @throws \UnexpectedValueException When the osisID does not have the expected shape
 */
protected function parseChapterName(DOMElement $chapterElem) : string
{
    $osisId = $chapterElem->getAttribute('osisID');

    // NOTE(review): the "." in the pattern is unescaped and therefore matches any
    // single separator character, not only a literal dot. Left as-is to keep
    // accepting whatever the existing data files use.
    if (preg_match("|^[A-Za-z1-3]*.([0-9]*)$|", $osisId, $matches) !== 1) {
        // Fail with a message that names the offending value instead of reading
        // an undefined match group.
        throw new \UnexpectedValueException("Unable to parse a chapter number from osisID \"$osisId\".");
    }

    return $matches[1];
}
/**
 * Extract the verse number from a verse element's "name" attribute:
 * the digits after the colon.
 *
 * @param DOMElement $verseElem The <verse> element
 * @return int
 * @throws \UnexpectedValueException When the name carries no verse number in the expected shape
 */
protected function parseVerseNumber(DOMElement $verseElem) : int
{
    $verseName = $verseElem->getAttribute('name');

    $matched = preg_match("|^[A-Za-z1-3 ]*\s*[0-9]*:([0-9]*)$|", $verseName, $matches) === 1;

    // The capture group may legitimately match an empty string ("... 3:"),
    // which is not a usable verse number either.
    if (!$matched || $matches[1] === '') {
        throw new \UnexpectedValueException("Unable to parse a verse number from name \"$verseName\".");
    }

    // Convert explicitly rather than relying on the return type to coerce the string.
    return (int) $matches[1];
}
/**
 * Read the book number from the first two characters of the file's base name
 * (the name without directory and extension).
 *
 * @param string $filepath Path of the book's XML file
 * @return int
 */
protected function parseBookNumberFromFilepath(string $filepath) : int
{
    $baseName = pathinfo($filepath, PATHINFO_FILENAME);
    $prefix = substr($baseName, 0, 2);

    // The two-character prefix is a string; the declared int return type converts it.
    return $prefix;
}
/**
 * Load a book's XML file into a new DOM document.
 *
 * @param string $filepath Path of the book's XML file
 * @return DOMDocument
 * @throws \RuntimeException When the file cannot be read or parsed
 */
protected function openBook(string $filepath) : DOMDocument
{
    $document = new \DOMDocument;

    // load() returns false on failure. Without this check an unreadable or
    // malformed file would yield an empty document, and the import would
    // create a book without any chapters.
    if ($document->load($filepath) === false) {
        throw new \RuntimeException("Unable to load book XML from $filepath");
    }

    return $document;
}
/**
 * Write an <info>-styled message to the console output, if one was supplied.
 *
 * @param string $message Text to print
 * @param bool   $newline Whether to end the message with a newline
 */
protected function write(string $message, bool $newline) : void
{
    // instanceof is false for null, so no separate null check is needed.
    if (!($this->output instanceof OutputStyle)) {
        return;
    }

    $this->output->write("<info>$message</info>", $newline, OutputInterface::VERBOSITY_NORMAL);
}
/**
 * Write an <info>-styled line to the console output, if one was supplied.
 *
 * @param string $message Text to print
 */
protected function writeln(string $message) : void
{
    // instanceof is false for null, so no separate null check is needed.
    if (!($this->output instanceof OutputStyle)) {
        return;
    }

    $this->output->writeln("<info>$message</info>", OutputInterface::VERBOSITY_NORMAL);
}
}