515 lines
16 KiB
PHP
515 lines
16 KiB
PHP
<?php
|
|
namespace App\Handlers;
|
|
|
|
use App\Book;
|
|
use App\Chapter;
|
|
use App\Helpers\Traits\BookTitleHelperTrait;
|
|
use App\Word;
|
|
use App\Verse;
|
|
use DOMDocument;
|
|
use DOMElement;
|
|
use DOMNodeList;
|
|
use Illuminate\Console\OutputStyle;
|
|
use Symfony\Component\Console\Output\OutputInterface;
|
|
|
|
/**
|
|
* UlbXmlImportHandler.php
|
|
*
|
|
* @author: Leonard Smith <leonard@acornwebconsultants.com>
|
|
* Date: 10/2/20
|
|
* Time: 8:04 AM
|
|
*/
|
|
class UlbXmlImportHandler
|
|
{
|
|
use BookXmlFilesTrait, BookTitleHelperTrait;
|
|
|
|
const REPO_FOLDER = '/ulb/';
|
|
|
|
/**
|
|
* @var DOMDocument
|
|
*/
|
|
protected $document;
|
|
|
|
/**
|
|
* @var int
|
|
*/
|
|
protected $currentGNTSort = 0;
|
|
|
|
/**
|
|
* @var int
|
|
*/
|
|
protected $currentULBSort = 0;
|
|
|
|
/**
|
|
* The following four integer variables provide state tracking
|
|
* for building ULB and GNT sort data. Instead of relying solely on
|
|
* OGNTSort which is pulled from the OpenGNT data sources, we need
|
|
* to create our own sort indices so that we can re-sort data
|
|
* on the fly.
|
|
*
|
|
* @var int
|
|
*/
|
|
protected $currentBookNumber = 0;
|
|
|
|
/**
|
|
* @var int
|
|
*/
|
|
protected $currentChapterNumber = 0;
|
|
|
|
/**
|
|
* @var int
|
|
*/
|
|
protected $currentVerseNumber = 0;
|
|
|
|
/**
|
|
* @var int
|
|
*/
|
|
protected $currentWordNumber = 0;
|
|
|
|
/**
|
|
* @var OutputStyle
|
|
*/
|
|
protected $output;
|
|
|
|
/**
|
|
* @var array
|
|
*/
|
|
protected $availableFiles = [];
|
|
|
|
/**
 * @param OutputStyle|null $output Console output used by write()/writeln();
 *                                 when null those helpers print nothing.
 */
public function __construct(?OutputStyle $output = null)
{
    // Explicit nullable type: relying on "= null" to make the type nullable
    // is deprecated as of PHP 8.4.
    $this->output = $output;
}
|
|
|
|
/**
 * Entry point: find the book XML files that exist on disk, then import them.
 *
 * Each key of self::$bookXmlFiles is passed, together with self::REPO_FOLDER,
 * to getBookXmlFilePath(); paths that exist are stored in
 * $this->availableFiles under that same key before loopOverXmlFiles() runs.
 */
public function run() : void
{
    foreach (self::$bookXmlFiles as $bookName => $unusedFilename) {
        $candidatePath = $this->getBookXmlFilePath(self::REPO_FOLDER, $bookName);

        // Skip books whose XML file is not present.
        if (!file_exists($candidatePath)) {
            continue;
        }

        $this->availableFiles[$bookName] = $candidatePath;
    }

    $this->loopOverXmlFiles();
}
|
|
|
|
/**
 * Import every file recorded in $this->availableFiles, then report "DONE".
 */
public function loopOverXmlFiles() : void
{
    $bookNames = array_keys($this->availableFiles);

    foreach ($bookNames as $bookName) {
        $this->importBook($bookName, $this->availableFiles[$bookName]);
    }

    $this->writeln("DONE");
}
|
|
|
|
/**
 * Import one book: create its Book record, then import and attach every
 * <chapter> element found in the book's XML document.
 *
 * @param string $bookTitle Book key; passed to prepareBookId()/prepareBookName()
 * @param string $filepath  Path of the book's XML file
 * @return Book The created book
 */
public function importBook(string $bookTitle, string $filepath) : Book
{
    // Remembered for getUlbSortNumber(), which prefixes every sort index with it.
    $this->currentBookNumber = $this->parseBookNumberFromFilepath($filepath);

    $this->write("Importing $bookTitle from $filepath Chapters: ", false);

    // Kept on the instance because buildVersePhrases() creates new nodes from it.
    $this->document = $this->openBook($filepath);

    $bookAttributes = [
        'id' => $this->prepareBookId($bookTitle),
        'name' => $this->prepareBookName($bookTitle),
    ];
    $book = Book::create($bookAttributes);

    $chapterElements = $this->document->getElementsByTagName('chapter');
    foreach ($chapterElements as $chapterElement) {
        $importedChapter = $this->importChapter($chapterElement, $book->id);
        $book->chapters()->save($importedChapter);
    }

    $this->write("Done.", true);

    return $book;
}
|
|
|
|
/**
 * Import one <chapter>: create its Chapter record, then import and attach
 * every <verse> element it contains.
 *
 * @param DOMElement $chapterElem
 * @param string $book_id Id of the owning book; prefixes the chapter id
 * @return Chapter
 */
public function importChapter(DOMElement $chapterElem, string $book_id) : Chapter
{
    $chapterName = $this->parseChapterName($chapterElem);

    // The tracking property is documented as int (and importVerse() casts its
    // counterpart), so store a number rather than the raw string.
    $this->currentChapterNumber = (int) $chapterName;

    $this->write($chapterName . ".", false);

    $chapter = Chapter::create([
        'id' => implode('-', [$book_id, $chapterName]),
        'name' => $chapterName,
    ]);

    foreach ($chapterElem->getElementsByTagName('verse') as $verseElem) {
        $verse = $this->importVerse($verseElem, $chapter->id);
        $chapter->verses()->save($verse);
    }

    return $chapter;
}
|
|
|
|
/**
 * Collect the replacement text of every <w sub="..."> element in a verse.
 *
 * Contributed by Tim Maggio and adapted by Leonard Smith.
 *
 * Resets $this->subs, then maps each word's "sub" attribute value to that
 * word's own text content.
 *
 * NOTE: no call to this method is visible in this class. importWord() fills
 * $this->subs with saved Word objects instead, so that the substitution
 * pattern can refer to the words table's id rather than a number that is only
 * relative to the verse. replaceSub() reads ->id from the stored entries, so
 * the plain strings stored here are not compatible with it as written.
 *
 * XPath was not used to find the elements; the <w> elements are filtered by
 * attribute via getElementsByAttribute() instead.
 *
 * @param DOMElement $verseElement Untyped in the signature; must provide getElementsByTagName()
 * @return void
 */
private function buildVerseSubs( $verseElement )
{
    $wordsWithSub = $this->getElementsByAttribute($verseElement->getElementsByTagName("w"), "sub");

    // Start from an empty map for each verse; it stays empty when the verse has no subs.
    $this->subs = [];

    foreach ($wordsWithSub as $wordElement) {
        // Key: the sub marker. Value: the text of this <w> element itself
        // (not of the whole verse), which is what the marker stands for.
        $this->subs[$wordElement->getAttribute("sub")] = $wordElement->nodeValue;
    }
}
|
|
|
|
/**
 * Flatten the <phrase> elements of a verse into sibling <w> elements.
 *
 * Contributed by Tim Maggio and adapted by Leonard Smith.
 *
 * For every <phrase> found below the verse:
 *  - a pseudo-word <w phraseId="N" strongs="-1"> holding the text of the
 *    phrase's first <phraseWords> element is inserted into the verse directly
 *    after the phrase (or appended when the phrase has no next sibling);
 *  - every <w> inside the phrase gets phraseId="N" and is moved out of the
 *    phrase, in order, to sit just before that pseudo-word.
 *
 * A word without a phraseId attribute is therefore just a plain word.
 * The verse element is modified in place; nothing is returned.
 *
 * @param DOMElement $verseElement
 * @return void
 */
private function buildVersePhrases( DOMElement $verseElement )
{
    // Phrase ids are a simple per-verse counter; they are not unique across verses.
    $phraseId = 1;

    foreach ($verseElement->getElementsByTagName("phrase") as $phraseEl) {
        // Captured before the DOM is changed so the pseudo-word lands right after the phrase.
        $nextSibling = $phraseEl->nextSibling;

        /** @var DOMElement|null $phraseWords */
        $phraseWords = $phraseEl->getElementsByTagName("phraseWords")->item(0);
        $phraseText = $phraseWords === null ? '' : (string) $phraseWords->nodeValue;

        // createElement()'s value argument is not escaped (an "&" truncates the
        // text), so the content is attached as a text node instead.
        $pseudoWord = $this->document->createElement('w');
        $pseudoWord->appendChild($this->document->createTextNode($phraseText));
        $pseudoWord->setAttribute('phraseId', (string) $phraseId);
        $pseudoWord->setAttribute('strongs', '-1');

        if ($nextSibling === null) {
            // The phrase is the last node of the verse, so the pseudo-word goes at the end.
            $verseElement->appendChild($pseudoWord);
        } else {
            $verseElement->insertBefore($pseudoWord, $nextSibling);
        }

        // Snapshot the live node list first: the loop below moves these very nodes.
        /** @var DOMElement $wordElem */
        foreach (iterator_to_array($phraseEl->getElementsByTagName("w")) as $wordElem) {
            $wordElem->setAttribute('phraseId', (string) $phraseId);

            // Promote the word one level up, keeping its phrase id.
            $verseElement->insertBefore($wordElem, $pseudoWord);
        }

        $phraseId++;
    }
}
|
|
|
|
/**
 * Import one <verse>: flatten its phrases, create the Verse record, then
 * import and attach every <w> element of the verse.
 *
 * @param DOMElement $verseElem
 * @param string $chapter_id Id of the owning chapter; prefixes the verse id
 * @return Verse
 */
public function importVerse(DOMElement $verseElem, string $chapter_id) : Verse
{
    $verseNumber = $this->parseVerseNumber($verseElem);
    $this->currentVerseNumber = (int) $verseNumber;

    // Rewrites the verse DOM in place: adds one pseudo-word per phrase and
    // promotes the words of each phrase one level up.
    $this->buildVersePhrases( $verseElem );

    $verse = Verse::create([
        'id' => implode('-', [$chapter_id, $verseNumber]),
        'name' => $verseNumber,
        'greek_text' => $verseElem->getElementsByTagName('Greek')[0]->nodeValue,
        'ulb_text' => $verseElem->getElementsByTagName('ULB')[0]->nodeValue,
    ]);

    // Per-verse state: the word counter restarts, and the sub map is emptied
    // because sub numbers are only meaningful within their own verse. Without
    // the reset, markers would resolve against words of earlier verses, and
    // replaceSub() would read a property that was never initialised.
    $this->currentWordNumber = 0;
    $this->subs = [];

    // After buildVersePhrases() this list holds every word required for the verse.
    foreach ($verseElem->getElementsByTagName('w') as $wordElem) {
        $word = $this->importWord($wordElem, $verse->id);
        $verse->words()->save($word);
    }

    return $verse;
}
|
|
|
|
/**
 * Replace every "[n]" sub marker in a word's text with the id of the Word
 * stored for that marker.
 *
 * Markers are looked up in $this->subs, which importWord() fills with the
 * saved Word for each element carrying a "sub" attribute. A known marker
 * becomes "[<word id>]"; an unknown one becomes "([n] ??)" to flag the problem.
 *
 * All markers are replaced in a single pass, so text inserted for one marker
 * is never matched again by a later one, and a repeated unknown marker is
 * flagged once per occurrence rather than being wrapped repeatedly.
 *
 * This might be language based; a pluggable substitution method may be needed,
 * and the logic arguably belongs on the back end.
 *
 * @param DOMElement $wordElem Untyped in the signature; only ->nodeValue is read
 * @return string The text with every marker substituted
 */
private function replaceSub( $wordElem )
{
    $text = (string) $wordElem->nodeValue;

    // $this->subs only exists once a word with a sub attribute has been
    // imported; treat "not there yet" as an empty map.
    $subs = (isset($this->subs) && is_array($this->subs)) ? $this->subs : [];

    $replaced = preg_replace_callback(
        "#\[\d+\]#",
        function (array $match) use ($subs) {
            $marker = $match[0];

            if (array_key_exists($marker, $subs)) {
                /** @var Word $word */
                $word = $subs[$marker];

                return "[$word->id]";
            }

            // The text has a marker, but no word was registered for it.
            return "({$marker} ??)";
        },
        $text
    );

    // preg_replace_callback() returns null on a PCRE error; keep the original text then.
    return $replaced === null ? $text : $replaced;
}
|
|
|
|
/**
 * Create the Word record for one <w> element.
 *
 * The element's usfm markup is stripped and its sub markers are substituted
 * before the text is stored. When the element carries a non-empty "sub"
 * attribute, the created Word is remembered in $this->subs under that value
 * so later words can refer to it.
 *
 * @param DOMElement $wordElem
 * @param string $verse_id Id of the owning verse; prefixes verse_code
 * @return Word
 */
public function importWord(DOMElement $wordElem, string $verse_id) : Word
{
    $ogntSort = $wordElem->getAttribute('OGNTsort');
    $subMarker = $wordElem->getAttribute('sub');
    $carriesSub = !empty($subMarker);

    // Strip markup first, then resolve any "[n]" markers in what is left.
    $ulbText = $this->replaceSub($this->stripUsfmMarkupFromUlb($wordElem));

    // Set by buildVersePhrases(); absent on words that are not part of a phrase.
    $phraseId = null;
    if ($wordElem->hasAttribute('phraseId')) {
        $phraseId = $wordElem->getAttribute('phraseId');
    }

    $attributes = [
        'verse_code' => implode('-', [$verse_id, $ogntSort]),
        'verse_id' => $verse_id,
        'ulb' => $ulbText,
        'phrase_id' => $phraseId,
        'sub' => $carriesSub,
        'greek' => $wordElem->getAttribute('text'),
        'lemma' => $wordElem->getAttribute('lemma'),
        'morph' => $wordElem->getAttribute('morph'),
        'ognt_sort' => empty($ogntSort) ? 0 : $ogntSort,
        'strongs_number' => $this->formatStrongsNumber($wordElem->getAttribute('strongs')),
        'ulb_sort' => $this->getUlbSortNumber(),
    ];
    $word = Word::create($attributes);

    if ($carriesSub) {
        $this->subs[$subMarker] = $word;
    }

    return $word;
}
|
|
|
|
/**
 * Filter a node list down to the elements that carry the given attribute.
 *
 * @param DOMNodeList $elements
 * @param string $attribute Attribute name to look for
 * @return array Matching elements, in list order, indexed from 0
 */
private function getElementsByAttribute(DOMNodeList $elements, string $attribute) : array
{
    $matching = array_filter(
        iterator_to_array($elements, false),
        function ($node) use ($attribute) {
            return $node->hasAttribute($attribute);
        }
    );

    // array_filter() keeps the original keys; callers get a plain list.
    return array_values($matching);
}
|
|
|
|
/**
 * Remove every <usfm> element found below the given element.
 *
 * The element is modified in place and returned for chaining.
 *
 * @param DOMElement $element
 * @return DOMElement The same element, without <usfm> descendants
 */
protected function stripUsfmMarkupFromUlb(DOMElement $element) : DOMElement
{
    // getElementsByTagName() returns a live list; removing nodes while
    // iterating it skips entries, so work on a snapshot.
    $usfmNodes = iterator_to_array($element->getElementsByTagName('usfm'), false);

    foreach ($usfmNodes as $usfmNode) {
        // Detach from the real parent: the node may be nested deeper than a
        // direct child, in which case $element->removeChild() would throw.
        if ($usfmNode->parentNode !== null) {
            $usfmNode->parentNode->removeChild($usfmNode);
        }
    }

    return $element;
}
|
|
|
|
/**
 * Build the next ULB sort index and advance the word counter.
 *
 * The index is the book number followed by the chapter number (2 digits,
 * zero padded), the verse number (3 digits) and the current word number
 * (3 digits). The word counter is incremented after it has been read, so the
 * first word after a reset gets 000.
 *
 * @return int
 */
protected function getUlbSortNumber() : int
{
    $digits = sprintf(
        '%d%02d%03d%03d',
        $this->currentBookNumber,
        $this->currentChapterNumber,
        $this->currentVerseNumber,
        $this->currentWordNumber++
    );

    // Convert explicitly instead of relying on the return type to coerce the string.
    return (int) $digits;
}
|
|
|
|
/**
 * Normalise a Strong's number taken from the XML.
 *
 * Not all of the Strong's numbers coming from the XML files are formatted the
 * same, so they are brought to a single "G<number>" form here:
 *  - "-1" (the marker buildVersePhrases() puts on pseudo-words) is returned unchanged;
 *  - "" and "0" yield null;
 *  - anything else has leading "G"/"g" characters removed and a single "G" prefixed.
 *
 * @param string $strongsNumber
 * @return string|null
 */
protected function formatStrongsNumber(string $strongsNumber) : ?string
{
    // Strict comparison: a loose "== -1" treats strings such as "-1abc"
    // differently on PHP 7 and PHP 8.
    if ($strongsNumber === '-1') {
        return $strongsNumber;
    }

    // The same two values empty() rejects for a string argument.
    if ($strongsNumber === '' || $strongsNumber === '0') {
        return null;
    }

    return 'G' . ltrim($strongsNumber, "Gg");
}
|
|
|
|
/**
 * Extract the chapter number from a chapter element's "osisID" attribute.
 *
 * The attribute must consist of a book part (letters and the digits 1-3),
 * one separator character, and a run of digits; that run of digits is
 * returned.
 *
 * @param DOMElement $chapterElem
 * @return string The digits following the separator
 * @throws \UnexpectedValueException When the osisID does not have that shape
 */
protected function parseChapterName(DOMElement $chapterElem) : string
{
    $osisId = $chapterElem->getAttribute('osisID');

    // Fail with a message that names the offending value instead of reading
    // an undefined match group.
    if (preg_match("|^[A-Za-z1-3]*.([0-9]*)$|", $osisId, $matches) !== 1) {
        throw new \UnexpectedValueException(
            sprintf('Unable to parse a chapter number from osisID "%s".', $osisId)
        );
    }

    return $matches[1];
}
|
|
|
|
/**
 * Extract the verse number from a verse element's "name" attribute.
 *
 * The attribute must end in "<digits>:<digits>" (for example "John 4:5");
 * the digits after the colon are returned as an integer.
 *
 * @param DOMElement $verseElem
 * @return int
 * @throws \UnexpectedValueException When no verse number can be found
 */
protected function parseVerseNumber(DOMElement $verseElem) : int
{
    $verseName = $verseElem->getAttribute('name');

    $matched = preg_match("|^[A-Za-z1-3 ]*\s*[0-9]*:([0-9]*)$|", $verseName, $matches);

    // The pattern also accepts an empty digit group, which is not a usable number.
    if ($matched !== 1 || $matches[1] === '') {
        throw new \UnexpectedValueException(
            sprintf('Unable to parse a verse number from name "%s".', $verseName)
        );
    }

    // Convert explicitly instead of relying on the return type to coerce the string.
    return (int) $matches[1];
}
|
|
|
|
/**
 * Read the book number from the leading digits of the file name.
 *
 * Only the first two characters of the file name (without directory and
 * extension) are considered, e.g. "41-MAT.xml" gives 41.
 *
 * @param string $filepath
 * @return int
 * @throws \InvalidArgumentException When the file name does not start with a digit
 */
protected function parseBookNumberFromFilepath(string $filepath) : int
{
    $filename = pathinfo($filepath, PATHINFO_FILENAME);

    // Take at most two leading digits and fail loudly when there are none,
    // rather than handing a non-numeric string to the int return type.
    if (preg_match('/^[0-9]{1,2}/', $filename, $matches) !== 1) {
        throw new \InvalidArgumentException(
            sprintf('File name "%s" does not start with a book number.', $filename)
        );
    }

    return (int) $matches[0];
}
|
|
|
|
/**
 * Load a book's XML file into a new DOM document.
 *
 * @param string $filepath
 * @return DOMDocument
 * @throws \RuntimeException When the file cannot be read or parsed
 */
protected function openBook(string $filepath) : DOMDocument
{
    $document = new DOMDocument();

    // load() reports failure through its return value; without this check an
    // unreadable or malformed file would be imported as a book with no chapters.
    if ($document->load($filepath) === false) {
        throw new \RuntimeException(
            sprintf('Unable to load book XML from "%s".', $filepath)
        );
    }

    return $document;
}
|
|
|
|
/**
 * Print an <info> message through the console output, if one was provided.
 *
 * Does nothing when no OutputStyle instance is set on the handler.
 *
 * @param string $message
 * @param bool $newline Whether to end the message with a newline
 */
protected function write(string $message, bool $newline) : void
{
    // instanceof is false for null, so this also covers "no output given".
    if (!($this->output instanceof OutputStyle)) {
        return;
    }

    $this->output->write("<info>$message</info>", $newline, OutputInterface::VERBOSITY_NORMAL);
}
|
|
|
|
/**
 * Print an <info> message followed by a newline through the console output,
 * if one was provided.
 *
 * Does nothing when no OutputStyle instance is set on the handler.
 *
 * @param string $message
 */
protected function writeln(string $message) : void
{
    // instanceof is false for null, so this also covers "no output given".
    if (!($this->output instanceof OutputStyle)) {
        return;
    }

    $this->output->writeln("<info>$message</info>", OutputInterface::VERBOSITY_NORMAL);
}
|
|
}
|