en_btr_backend/app/Handlers/UlbXmlImportHandler.php

<?php
namespace App\Handlers;

use App\Book;
use App\Chapter;
use App\Helpers\Traits\BookTitleHelperTrait;
use App\Word;
use App\Verse;
use DOMDocument;
use DOMElement;
use DOMNodeList;
use Illuminate\Console\OutputStyle;
use Symfony\Component\Console\Output\OutputInterface;

/**
 * UlbXmlImportHandler.php
 *
 * @author: Leonard Smith <leonard@acornwebconsultants.com>
 * Date: 10/2/20
 * Time: 8:04 AM
 */
class UlbXmlImportHandler
{
    // NOTE(review): self::$bookXmlFiles, getBookXmlFilePath(), prepareBookId() and
    // prepareBookName() are not defined in this class; presumably they are provided
    // by these traits - confirm.
    use BookXmlFilesTrait, BookTitleHelperTrait;

    const REPO_FOLDER = '/ulb/';

    /**
     * Value of the `strongs` attribute placed on the pseudo-word that
     * buildVersePhrases() creates for each phrase. formatStrongsNumber()
     * passes this value through untouched instead of prefixing it with "G".
     */
    const PHRASE_STRONGS_NUMBER = '-1';

    /**
     * The XML document of the book currently being imported.
     *
     * @var DOMDocument
     */
    protected $document;

    /**
     * @var int
     */
    protected $currentGNTSort = 0;

    /**
     * @var int
     */
    protected $currentULBSort = 0;

    /**
     * The following four integer variables provide state tracking
     * for building ULB and GNT sort data. Instead of relying solely on
     * OGNTSort which is pulled from the OpenGNT data sources, we need
     * to create our own sort indices so that we can re-sort data
     * on the fly.
     *
     * @var int
     */
    protected $currentBookNumber = 0;

    /**
     * @var int
     */
    protected $currentChapterNumber = 0;

    /**
     * @var int
     */
    protected $currentVerseNumber = 0;

    /**
     * Zero-based position of the next word within the current verse.
     *
     * @var int
     */
    protected $currentWordNumber = 0;

    /**
     * Words of the verse currently being imported that carry a `sub`
     * attribute, keyed by the raw value of that attribute. Reset at the
     * start of every verse because sub numbers are only meaningful
     * relative to the verse they appear in.
     *
     * @var array
     */
    protected $subs = [];

    /**
     * Console output for progress messages; null disables all output.
     *
     * @var OutputStyle|null
     */
    protected $output;

    /**
     * Book name => path of the XML file found on disk for that book.
     *
     * @var array
     */
    protected $availableFiles = [];

    /**
     * @param OutputStyle|null $output Where progress messages are written; pass null to stay silent.
     */
    public function __construct(?OutputStyle $output = null)
    {
        $this->output = $output;
    }

    /**
     * Collects every known book XML file that exists on disk and imports them all.
     */
    public function run() : void
    {
        // Only the book name (the key) is needed to resolve the path.
        foreach (self::$bookXmlFiles as $name => $unusedFilename) {
            $filepath = $this->getBookXmlFilePath(self::REPO_FOLDER, $name);

            if (file_exists($filepath)) {
                $this->availableFiles[$name] = $filepath;
            }
        }

        $this->loopOverXmlFiles();
    }

    /**
     * Imports every file previously collected in $availableFiles, then reports completion.
     */
    public function loopOverXmlFiles() : void
    {
        foreach ($this->availableFiles as $name => $filepath) {
            $this->importBook($name, $filepath);
        }
        $this->writeln("DONE");
    }

    /**
     * Creates a Book record from one XML file and imports all of its chapters.
     *
     * @param string $bookTitle
     * @param string $filepath
     * @return Book
     * @throws \UnexpectedValueException when the book number cannot be read from the file name
     * @throws \RuntimeException when the XML file cannot be loaded
     */
    public function importBook(string $bookTitle, string $filepath) : Book
    {
        $this->currentBookNumber = $this->parseBookNumberFromFilepath($filepath);

        $this->write("Importing $bookTitle from $filepath Chapters: ", false);

        // Load the document before creating any record so that an unreadable
        // file does not leave an empty book behind.
        $this->document = $this->openBook($filepath);

        $book = Book::create([
            'id' => $this->prepareBookId($bookTitle),
            'name' => $this->prepareBookName($bookTitle),
        ]);

        foreach ($this->document->getElementsByTagName('chapter') as $chapterElem) {
            $chapter = $this->importChapter($chapterElem, $book->id);
            $book->chapters()->save($chapter);
        }

        $this->write("Done.", true);

        return $book;
    }

    /**
     * Creates a Chapter record from a <chapter> element and imports all of its verses.
     *
     * @param DOMElement $chapterElem
     * @param string $book_id Id of the owning book; used as prefix of the chapter id.
     * @return Chapter
     * @throws \UnexpectedValueException when the chapter's osisID cannot be parsed
     */
    public function importChapter(DOMElement $chapterElem, string $book_id) : Chapter
    {
        $chapterName = $this->parseChapterName($chapterElem);
        $this->currentChapterNumber = (int) $chapterName;

        $this->write($chapterName . ".", false);

        $chapter = Chapter::create([
            'id' => implode('-', [$book_id, $chapterName]),
            'name' => $chapterName,
        ]);

        foreach ($chapterElem->getElementsByTagName('verse') as $verseElem) {
            $verse = $this->importVerse($verseElem, $chapter->id);
            $chapter->verses()->save($verse);
        }

        return $chapter;
    }

    /**
     * Collects the text of every <w sub="..."> element of a verse into $this->subs,
     * keyed by the value of the `sub` attribute.
     *
     * Contributed by Tim Maggio and adapted by Leonard Smith
     *
     * @param DOMElement $verseElement
     *
     * @NOTE: This method is currently not called from anywhere in this class. It makes
     * more sense to handle the subs closer to the importWord logic. That way, we can save
     * a copy of the Word object and have access to the saved id that can then replace the
     * sub number in the ULB text. That gives us a substitution pattern that refers to the
     * actual words table's id rather than a number that is only relative to the respective
     * verse. Theoretically, this makes our data more robust and less prone to errors down
     * the road.
     *
     * @NOTE: This stores plain strings, whereas replaceSub() reads `->id` from the stored
     * values (as written by importWord()). The two must not be combined as they stand.
     */
    private function buildVerseSubs( $verseElement )
    {
        // xpath is not used here; iterating over the <w> elements ourselves is the
        // most direct way to find the ones that carry a sub attribute.
        $subElements = $this->getElementsByAttribute($verseElement->getElementsByTagName("w"), "sub");

        // Zeroed out for each verse and populated only if there are any subs
        $this->subs = [];

        foreach ( $subElements as $subElement )
        {
            // The text of this <w> element (not of the whole verse) is the replacement
            $this->subs[ $subElement->getAttribute( "sub" ) ] = $subElement->nodeValue;
        }
    }

    /**
     * Flattens the <phrase> elements of a verse so that importVerse() can treat
     * everything as a plain list of <w> elements.
     *
     * For every <phrase>:
     *  - a pseudo-word <w> holding the text of its <phraseWords> child is inserted
     *    right after the phrase, with strongs set to PHRASE_STRONGS_NUMBER;
     *  - every <w> inside the phrase is moved up into the verse, in front of the
     *    pseudo-word;
     *  - the pseudo-word and the moved words all receive the same `phraseId`.
     *
     * A word without a phraseId is just a word.
     *
     * Contributed by Tim Maggio and adapted by Leonard Smith
     *
     * @param DOMElement $verseElement
     */
    private function buildVersePhrases( DOMElement $verseElement )
    {
        // Phrase ID is a simple numeric increment.
        // No need to be unique across verses.
        $phraseId = 1;

        foreach ( $verseElement->getElementsByTagName( "phrase" ) as $phraseEl )
        {
            // Remember where the phrase ends before the tree is modified
            $nextSibling = $phraseEl->nextSibling;

            /** @var DOMElement|null $phraseWordsEl */
            $phraseWordsEl = $phraseEl->getElementsByTagName( "phraseWords" )->item(0);
            $phraseText = $phraseWordsEl !== null ? (string) $phraseWordsEl->nodeValue : '';

            // Build the pseudo-word. The text goes in through a text node because the
            // value argument of createElement() is not escaped, so a "&" or "<" in the
            // phrase would otherwise corrupt the new element.
            $pseudoWord = $this->document->createElement('w');
            $pseudoWord->appendChild($this->document->createTextNode($phraseText));
            $pseudoWord->setAttribute('phraseId', (string) $phraseId);
            $pseudoWord->setAttribute('strongs', self::PHRASE_STRONGS_NUMBER);

            if ($nextSibling === null) {
                // We are at the end of a verse, so we can add it to the end
                $verseElement->appendChild($pseudoWord);
            } else {
                $verseElement->insertBefore($pseudoWord, $nextSibling);
            }

            // Snapshot the node list: it is live, and moving nodes out of the phrase
            // while iterating over it directly would skip words.
            /** @var DOMElement $wordElem */
            foreach ( iterator_to_array($phraseEl->getElementsByTagName( "w" )) as $wordElem )
            {
                $wordElem->setAttribute('phraseId', (string) $phraseId);

                // Promote the word up one level, in front of the pseudo-word
                $verseElement->insertBefore( $wordElem, $pseudoWord );
            }

            $phraseId++;
        }
    }

    /**
     * Creates a Verse record from a <verse> element and imports all of its words.
     *
     * @param DOMElement $verseElem
     * @param string $chapter_id Id of the owning chapter; used as prefix of the verse id.
     * @return Verse
     * @throws \UnexpectedValueException when the verse number cannot be parsed
     */
    public function importVerse(DOMElement $verseElem, string $chapter_id) : Verse
    {
        $verseNumber = $this->parseVerseNumber($verseElem);
        $this->currentVerseNumber = $verseNumber;

        // Sub numbers are relative to a verse, so never carry them over
        $this->subs = [];

        // Promote phrase words and add the phrase pseudo-words
        $this->buildVersePhrases( $verseElem );

        $verse = Verse::create([
            'id' => implode('-', [$chapter_id, $verseNumber]),
            'name' => $verseNumber,
            'greek_text' => $verseElem->getElementsByTagName('Greek')[0]->nodeValue,
            'ulb_text' => $verseElem->getElementsByTagName('ULB')[0]->nodeValue,
        ]);

        // At this point all phrases have been compiled: each has a pseudo-word in
        // the verse and its words have been promoted up one level, so the <w>
        // elements below are all the words required for this verse.
        $this->currentWordNumber = 0; // reset the word counter
        foreach ($verseElem->getElementsByTagName('w') as $wordElem) {
            $word = $this->importWord($wordElem, $verse->id);
            $verse->words()->save($word);
        }

        return $verse;
    }

    /**
     * Returns the text of a word element with every "[<digits>]" placeholder replaced.
     *
     * A placeholder that is a key of $this->subs becomes "[<id of the stored word>]";
     * any other placeholder becomes "(<placeholder> ??)" to flag the problem.
     *
     * All placeholders are replaced in a single pass, so text that has already been
     * substituted is never matched again (a stored word id may well look like another
     * sub number).
     *
     * NOTE(review): the lookup key is the placeholder including its brackets, while
     * importWord() keys $this->subs by the raw `sub` attribute. This only matches if
     * that attribute is written with brackets as well - confirm against the XML.
     *
     * @param DOMElement $wordElem
     * @return string
     */
    private function replaceSub( $wordElem ) : string
    {
        $text = (string) $wordElem->nodeValue;

        $replaced = preg_replace_callback(
            "#\[\d+\]#",
            function (array $match) : string {
                $placeholder = $match[0];

                if ( array_key_exists( $placeholder, $this->subs ) ) {
                    /** @var Word $word */
                    $word = $this->subs[ $placeholder ];

                    return '[' . $word->id . ']';
                }

                // The <w> node value has a [X] in it, but there is no matching sub
                return "({$placeholder} ??)";
            },
            $text
        );

        // preg_replace_callback() yields null on a PCRE error; keep the text as it was
        return $replaced === null ? $text : $replaced;
    }

    /**
     * Creates a Word record from a <w> element.
     *
     * Note that <usfm> markup is removed from the element itself (the DOM is modified)
     * before its text is read.
     *
     * @param DOMElement $wordElem
     * @param string $verse_id Id of the owning verse.
     * @return Word
     */
    public function importWord(DOMElement $wordElem, string $verse_id) : Word
    {
        $ognt_sort = $wordElem->getAttribute('OGNTsort');

        // Are we dealing with an element with a sub attribute?
        $sub = $wordElem->getAttribute('sub');
        $hasSub = !empty($sub);

        // Strip the markup first, then resolve any sub placeholders in what is left
        $ulb = $this->replaceSub($this->stripUsfmMarkupFromUlb($wordElem));

        $word = Word::create([
            'verse_code' => implode('-', [$verse_id, $ognt_sort]),
            'verse_id' => $verse_id,
            'ulb' => $ulb,
            // phraseId is set by buildVersePhrases()
            'phrase_id' => $wordElem->hasAttribute('phraseId') ? $wordElem->getAttribute('phraseId') : null,
            'sub' => $hasSub,
            'greek' => $wordElem->getAttribute('text'),
            'lemma' =>  $wordElem->getAttribute('lemma'),
            'morph' => $wordElem->getAttribute('morph'),
            'ognt_sort' => empty($ognt_sort) ? 0 : $ognt_sort,
            'strongs_number' => $this->formatStrongsNumber($wordElem->getAttribute('strongs')),
            'ulb_sort' => $this->getUlbSortNumber(),
        ]);

        // Save this word object to the subs stack so that we can grab it when we need it.
        if ($hasSub) {
            $this->subs[$sub] = $word;
        }

        return $word;
    }

    /**
     * Filters a node list down to the elements that carry the given attribute.
     *
     * @param DOMNodeList $elements
     * @param string $attribute
     * @return array
     */
    private function getElementsByAttribute(DOMNodeList $elements, string $attribute) : array
    {
        $results = [];
        /** @var DOMElement $el */
        foreach ($elements as $el) {
            if ($el->hasAttribute($attribute)) {
                $results[] = $el;
            }
        }
        return $results;
    }

    /**
     * Removes every <usfm> descendant from the given element and returns that same element.
     *
     * @param DOMElement $element
     * @return DOMElement
     */
    protected function stripUsfmMarkupFromUlb(DOMElement $element) : DOMElement
    {
        // The list returned by getElementsByTagName() is live: removing nodes while
        // iterating over it directly skips entries, so work on a snapshot.
        foreach (iterator_to_array($element->getElementsByTagName('usfm')) as $node) {
            // The match may be a deeper descendant, so detach it from its own parent
            // rather than from $element.
            if ($node->parentNode !== null) {
                $node->parentNode->removeChild($node);
            }
        }

        return $element;
    }

    /**
     * Builds the sort key for the next word and advances the word counter.
     *
     * The key is the concatenation of book number, 2-digit chapter, 3-digit verse
     * and 3-digit word position.
     *
     * @return int
     */
    protected function getUlbSortNumber() : int
    {
        $sortKey = sprintf(
            '%d%02d%03d%03d',
            $this->currentBookNumber,
            $this->currentChapterNumber,
            $this->currentVerseNumber,
            $this->currentWordNumber
        );

        $this->currentWordNumber++;

        return (int) $sortKey;
    }

    /**
     * Not all of the strong's numbers coming from the XML files are formatted the same. Let's fix that here.
     *
     * The phrase marker (PHRASE_STRONGS_NUMBER) is returned as is, an empty value
     * ("" or "0") becomes null, anything else is normalised to a single leading "G".
     *
     * @param string $strongsNumber
     * @return string|null
     */
    protected function formatStrongsNumber(string $strongsNumber) : ?string
    {
        if ($strongsNumber === self::PHRASE_STRONGS_NUMBER) {
            return $strongsNumber;
        }

        if ($strongsNumber === '' || $strongsNumber === '0') {
            return null;
        }

        return 'G' . ltrim($strongsNumber, "Gg");
    }

    /**
     * Extracts the chapter number from the `osisID` attribute of a <chapter> element.
     *
     * @param DOMElement $chapterElem
     * @return string
     * @throws \UnexpectedValueException when the osisID does not have the expected format
     */
    protected function parseChapterName(DOMElement $chapterElem) : string
    {
        $osisId = $chapterElem->getAttribute('osisID');

        if (preg_match("|^[A-Za-z1-3]*.([0-9]*)$|", $osisId, $matches) !== 1) {
            throw new \UnexpectedValueException("Unable to parse a chapter number from osisID \"$osisId\".");
        }

        return $matches[1];
    }

    /**
     * Extracts the verse number (the digits after the colon) from the `name`
     * attribute of a <verse> element.
     *
     * @param DOMElement $verseElem
     * @return int
     * @throws \UnexpectedValueException when the name does not end in "<chapter>:<verse>"
     */
    protected function parseVerseNumber(DOMElement $verseElem) : int
    {
        $name = $verseElem->getAttribute('name');

        $matched = preg_match("|^[A-Za-z1-3 ]*\s*[0-9]*:([0-9]*)$|", $name, $matches) === 1;

        // The pattern also accepts an empty verse part, which is not a usable number
        if (!$matched || $matches[1] === '') {
            throw new \UnexpectedValueException("Unable to parse a verse number from verse name \"$name\".");
        }

        return (int) $matches[1];
    }

    /**
     * Reads the book number from the first two characters of the file name.
     *
     * @param string $filepath
     * @return int
     * @throws \UnexpectedValueException when the file name does not start with a number
     */
    protected function parseBookNumberFromFilepath(string $filepath) : int
    {
        $filename = pathinfo($filepath, PATHINFO_FILENAME);
        $prefix = substr($filename, 0, 2);

        if (!is_numeric($prefix)) {
            throw new \UnexpectedValueException("Unable to parse a book number from file name \"$filename\".");
        }

        return (int) $prefix;
    }

    /**
     * Loads an XML file into a new DOMDocument.
     *
     * @param string $filepath
     * @return DOMDocument
     * @throws \RuntimeException when the file cannot be loaded as XML
     */
    protected function openBook(string $filepath) : DOMDocument
    {
        $document = new \DOMDocument;

        if ($document->load($filepath) === false) {
            throw new \RuntimeException("Unable to load XML document \"$filepath\".");
        }

        return $document;
    }

    /**
     * Writes an info message to the console output, if there is one.
     *
     * @param string $message
     * @param bool $newline Whether to end the message with a line break.
     */
    protected function write(string $message, bool $newline) : void
    {
        if ($this->output instanceof OutputStyle) {
            $this->output->write("<info>$message</info>", $newline, OutputInterface::VERBOSITY_NORMAL);
        }
    }

    /**
     * Writes an info message followed by a line break to the console output, if there is one.
     *
     * @param string $message
     */
    protected function writeln(string $message) : void
    {
        if ($this->output instanceof OutputStyle) {
            $this->output->writeln("<info>$message</info>", OutputInterface::VERBOSITY_NORMAL);
        }
    }
}