Set up ULB XML Importer

This commit is contained in:
Leonard Smith 2020-10-02 11:03:50 -05:00
parent fce5f52300
commit 2750bf115a
5 changed files with 551 additions and 0 deletions

View File

@ -0,0 +1,44 @@
<?php
namespace App\Console\Commands;
use App\Handlers\UlbXmlImportHandler;
use Illuminate\Console\Command;
class ImportUlbXmlData extends Command
{
    /**
     * Signature used to invoke this console command.
     *
     * @var string
     */
    protected $signature = 'gwt:import-ulb-xml';

    /**
     * Human-readable description of this console command.
     *
     * @var string
     */
    protected $description = 'Import ULB data from XML file';

    /**
     * Build the command; nothing beyond the parent's own setup is needed.
     *
     * @return void
     */
    public function __construct()
    {
        parent::__construct();
    }

    /**
     * Run the command by delegating to a freshly created UlbXmlImportHandler.
     *
     * @return mixed
     */
    public function handle()
    {
        (new UlbXmlImportHandler())->run();
    }
}

View File

@ -0,0 +1,49 @@
<?php
namespace App\Handlers;
/**
* BookXmlFilesTrait.php
*
* @author: Leonard Smith <leonard@acornwebconsultants.com>
* Date: 9/19/20
* Time: 10:43 AM
*/
trait BookXmlFilesTrait
{
    /**
     * Map of lower-case book keys to the XML file name holding that book.
     *
     * @var array
     */
    protected static $bookXmlFiles = [
        'matthew' => '41-MAT.xml',
        'mark' => '42-MRK.xml',
        'luke' => '43-LUK.xml',
        'john' => '44-JHN.xml',
        'acts' => '45-ACT.xml',
        'romans' => '46-ROM.xml',
        '1corinthians' => '47-1CO.xml',
        '2corinthians' => '48-2CO.xml',
        'galatians' => '49-GAL.xml',
        'ephesians' => '50-EPH.xml',
        'philippians' => '51-PHP.xml',
        'colossians' => '52-COL.xml',
        '1thessalonians' => '53-1TH.xml',
        // Was '54-1TH.xml' (copied from the line above); every other entry
        // pairs its sequence number with the book's own code.
        '2thessalonians' => '54-2TH.xml',
        '1timothy' => '55-1TI.xml',
        '2timothy' => '56-2TI.xml',
        'titus' => '57-TIT.xml',
        'philemon' => '58-PHM.xml',
        'hebrews' => '59-HEB.xml',
        'james' => '60-JAS.xml',
        '1peter' => '61-1PE.xml',
        '2peter' => '62-2PE.xml',
        '1john' => '63-1JN.xml',
        '2john' => '64-2JN.xml',
        '3john' => '65-3JN.xml',
        'jude' => '66-JUD.xml',
        'revelation' => '67-REV.xml',
    ];

    /**
     * Build the path of a book's XML file inside the storage directory.
     *
     * @param string $folder Folder below storage_path(); leading and trailing
     *                       slashes are normalised.
     * @param string $book   Key into self::$bookXmlFiles (keys are lower-case).
     * @return string
     * @throws \InvalidArgumentException When $book is not a known book key.
     */
    public function getBookXmlFilePath($folder, $book)
    {
        // Fail loudly instead of silently returning the folder path when the
        // book key is unknown.
        if (!is_string($book) || !isset(self::$bookXmlFiles[$book])) {
            throw new \InvalidArgumentException('Unknown book: ' . var_export($book, true));
        }

        $folder = '/' . trim($folder, '/') . '/';

        return storage_path() . $folder . self::$bookXmlFiles[$book];
    }
}

View File

@ -0,0 +1,130 @@
<?php
namespace App\Handlers;
use Illuminate\Support\Facades\URL;
/**
* LexiconHandler.php
*
* @author: Leonard Smith <leonard@acornwebconsultants.com>
* Date: 9/19/20
* Time: 8:56 AM
*/
class LexiconHandler
{
    use BookXmlFilesTrait;

    /**
     * Collect the formatted lexical entries for every word of one verse.
     *
     * @param string $book
     * @param string|int $chapter
     * @param string|int $verse
     * @return array List of arrays as produced by formatForJson().
     */
    public function getEntriesJson($book, $chapter, $verse)
    {
        $entries = $this->getEntriesByVerse($book, $chapter, $verse);
        $json = [];
        foreach ($entries as $id => $entry) {
            $json[] = $this->formatForJson($id, $entry);
        }

        return $json;
    }

    /**
     * Look up a single lexical entry by its Strong's number.
     *
     * @param string $id Strong's number, used both for lookup and as entry id.
     * @return array Array as produced by formatForJson().
     */
    public static function getEntryById($id)
    {
        // getLexicalContent() and formatForJson() are instance methods that
        // use $this, so this static entry point must work through an instance
        // rather than calling them via self::.
        $handler = new static();

        return $handler->formatForJson($id, $handler->getLexicalContent($id));
    }

    /**
     * Wrap a raw lexicon entry in the resource array returned to clients.
     *
     * @param string $id    Strong's number of the entry.
     * @param string $entry Raw entry text as returned by getLexicalContent().
     * @return array
     */
    public function formatForJson($id, $entry)
    {
        [$lexeme, $commentary] = $this->parseEntry($entry);

        return [
            'type' => 'lexical-entries',
            'id' => $id,
            'attributes' => [
                'strongs-number' => $id,
                'lexeme' => $lexeme,
                'commentary' => $commentary,
            ],
            'links' => [
                'self' => URL::route('lexicon-entry', $id),
            ],
        ];
    }

    /**
     * Extract the lexeme and the commentary portion into separate variables.
     *
     * The first non-blank line (minus leading '#' characters) is the lexeme;
     * every following non-blank line is appended to the commentary, each
     * terminated by "\n".
     *
     * @param string $entry
     * @return array Two-element list: [lexeme, commentary].
     */
    public function parseEntry($entry)
    {
        $lexeme = '';
        $commentary = '';
        $n = 0;
        foreach (preg_split('~[\r\n]+~', $entry) as $line) {
            // Skip blank / whitespace-only lines. An explicit '' comparison is
            // used because empty() would also discard a line containing "0".
            if ($line === '' || ctype_space($line)) {
                continue;
            }
            if ($n === 0) {
                $lexeme = ltrim($line, '#');
            } else {
                $commentary .= $line . "\n";
            }
            $n++;
        }

        return [$lexeme, $commentary];
    }

    /**
     * Load the lexicon content for every word (<w>) of the requested verse.
     *
     * @param string $book
     * @param string|int $chapter
     * @param string|int $verse
     * @return array Raw lexicon content keyed by the word's "lemma" attribute.
     */
    public function getEntriesByVerse($book, $chapter, $verse)
    {
        // The trait exposes getBookXmlFilePath() (there is no getBookXmlFile())
        // and its book keys are lower-case.
        $xmlFile = $this->getBookXmlFilePath('/ulb/', strtolower($book));
        $document = new \DOMDocument;
        $document->load($xmlFile);

        // Loop-invariant: the "name" attribute value identifying the verse.
        $verseName = ucfirst($book) . ' ' . $chapter . ':' . $verse;

        $entryArray = [];
        foreach ($document->getElementsByTagName('verse') as $vn) {
            if ($vn->getAttribute('name') !== $verseName) {
                continue;
            }
            foreach ($vn->getElementsByTagName('w') as $entry) {
                $strongsNumber = $entry->getAttribute('lemma');
                $entryArray[$strongsNumber] = $this->getLexicalContent($strongsNumber);
            }
        }

        return $entryArray;
    }

    /**
     * Read the markdown file for a Strong's number.
     *
     * @param string $strongsNumber
     * @return string File contents, or a "CONTENT NOT FOUND: <path>" marker
     *                when the file does not exist.
     */
    public function getLexicalContent($strongsNumber)
    {
        $filepath = $this->getFilePath($strongsNumber);
        if (file_exists($filepath)) {
            $contents = file_get_contents($filepath);
        } else {
            $contents = 'CONTENT NOT FOUND: ' . $filepath;
        }

        return $contents;
    }

    /**
     * Path of the markdown file for a Strong's number.
     *
     * @param string $strongsNumber
     * @return string
     */
    public function getFilePath($strongsNumber)
    {
        return $this->getFolderName($strongsNumber) . lcfirst($strongsNumber) . '.md';
    }

    /**
     * Folder holding the block of ten entries that contains $strongsNumber,
     * e.g. ".../gwt/g1-g10/" for G1 through G10.
     *
     * @param string $strongsNumber One prefix character followed by digits.
     * @return string
     */
    public function getFolderName($strongsNumber)
    {
        // Drop the one-character prefix; clamp to 1 so an unparsable number
        // still maps to the first block instead of an inverted range.
        $intValue = max(1, intval(substr($strongsNumber, 1)));

        // Derive the lower bound from the upper one. Computing it separately
        // with floor() + 1 gave "g11-g10" for exact multiples of ten.
        $upperValue = (int) (ceil($intValue / 10) * 10);
        $lowerValue = $upperValue - 9;

        return storage_path() . '/gwt/' . 'g' . $lowerValue . '-' . 'g' . $upperValue . '/';
    }
}

View File

@ -0,0 +1,155 @@
<?php
namespace App\Handlers;
use Illuminate\Support\Facades\URL;
/**
* TextMorphologyHandler.php
*
* @author: Leonard Smith <leonard@acornwebconsultants.com>
* Date: 8/18/20
* Time: 7:31 PM
*/
class TextMorphologyHandler
{
    use BookXmlFilesTrait;

    /**
     * Collect the word data of one verse from the book's XML file.
     *
     * @param string $book
     * @param string|int $chapter
     * @param string|int $verse
     * @return array List of arrays with keys ulb, greek, morph, ognt-sort, lexeme.
     */
    public function getVerse($book, $chapter, $verse)
    {
        $document = $this->openBookXml($book);

        // Loop-invariant: the "name" attribute value identifying the verse.
        $verseName = ucfirst($book) . ' ' . $chapter . ':' . $verse;

        $words = [];
        foreach ($document->getElementsByTagName('verse') as $vn) {
            if ($vn->getAttribute('name') !== $verseName) {
                continue;
            }
            foreach ($vn->getElementsByTagName('w') as $w) {
                $words[] = [
                    'ulb' => $w->nodeValue,
                    'greek' => $w->getAttribute('text'),
                    'morph' => $w->getAttribute('morph'),
                    'ognt-sort' => $w->getAttribute('OGNTsort'),
                    'lexeme' => $w->getAttribute('lexeme'),
                ];
            }
        }

        return $words;
    }

    /**
     * JSON-encode the word resources of one verse.
     *
     * Each word id is "<book>-<chapter>-<verse>-<ognt-sort>".
     *
     * @param string $book
     * @param string|int $chapter
     * @param string|int $verse
     * @return string|false Result of json_encode().
     */
    public function getWordsJson($book, $chapter, $verse)
    {
        $words = $this->getVerse($book, $chapter, $verse);
        $json = [];
        foreach ($words as $word) {
            $json[] = $this->getWordJson(implode('-', [$book, $chapter, $verse, $word['ognt-sort']]), $word);
        }

        return json_encode($json);
    }

    /**
     * Wrap one word's data in the resource array returned to clients.
     *
     * @param string $id
     * @param array $word
     * @return array
     */
    protected function getWordJson($id, $word)
    {
        return [
            'type' => 'words',
            'id' => $id,
            'attributes' => $word,
            'links' => [
                'self' => URL::route('verse-word', $id),
            ]
        ];
    }

    /**
     * Find a single word by an id of the form produced in getWordsJson().
     *
     * @param string $id "<book>-<chapter>-<verse>-<ognt-sort>".
     * @return array|string The word resource array, or a "NOT FOUND: ..."
     *                      string when no word of the verse matches.
     */
    public function getWordById($id)
    {
        $idParts = explode('-', $id);
        $book = $idParts[0];
        $chapter = $idParts[1];
        $verse = $idParts[2];
        $ogntSort = $idParts[3];

        foreach ($this->getVerse($book, $chapter, $verse) as $word) {
            // Loose comparison kept on purpose so numerically equal sort keys
            // still match, as before.
            if ($word['ognt-sort'] == $ogntSort) {
                return $this->getWordJson($id, $word);
            }
        }

        return "NOT FOUND: $ogntSort\n";
    }

    /**
     * Load the XML document of a book.
     *
     * @param string $book Book name; lower-cased before the file lookup.
     * @return \DOMDocument
     */
    protected function openBookXml($book)
    {
        $book = strtolower($book);
        // The trait exposes getBookXmlFilePath(); there is no getBookXmlFile().
        $xmlFile = $this->getBookXmlFilePath('/ulb', $book);
        $document = new \DOMDocument;
        $document->load($xmlFile);

        return $document;
    }

    /**
     * JSON-encode the chapter/verse data of a whole book.
     *
     * @param string $book
     * @return string|false Result of json_encode().
     */
    public function getChaptersByBook($book)
    {
        $document = $this->openBookXml($book);
        $data = [];
        foreach ($document->getElementsByTagName('chapter') as $node) {
            $data[] = $this->getChapterDataFromNode($node);
        }

        return json_encode($data);
    }

    /**
     * Not implemented yet: the body is empty, so this returns null.
     *
     * @param mixed $id
     * @return void
     */
    public function getChapterDataById($id)
    {
    }

    /**
     * Build the chapter name and verse list for a <chapter> node.
     *
     * @param \DOMElement $node
     * @return array Array with keys "chapter" and "verses".
     */
    public function getChapterDataFromNode($node)
    {
        return [
            'chapter' => $this->getChapterNameFromOsisId($node->getAttribute('osisID')),
            'verses' => $this->getUlbChapterVerses($node),
        ];
    }

    /**
     * Extract the trailing digits of an osisID attribute value.
     *
     * @param string $osisId
     * @return string|null Captured digits, or null when the pattern does not match.
     */
    public function getChapterNameFromOsisId($osisId)
    {
        // Only read the capture group when the pattern actually matched.
        if (preg_match("|^[A-Za-z1-3]*.([0-9]*)$|", $osisId, $matches) !== 1) {
            return null;
        }

        return $matches[1];
    }

    /**
     * List number and ULB text of every <verse> below the given node.
     *
     * @param \DOMElement $xmlNode
     * @return array List of arrays with keys "number" and "text".
     */
    public function getUlbChapterVerses($xmlNode)
    {
        $verses = [];
        foreach ($xmlNode->getElementsByTagName('verse') as $vn) {
            // item(0) is null when the verse has no <ULB> child.
            $ulbNode = $vn->getElementsByTagName('ULB')->item(0);
            $verses[] = [
                'number' => $this->getVerseNumber($vn->getAttribute('name')),
                'text' => $ulbNode !== null ? $ulbNode->nodeValue : null,
            ];
        }

        return $verses;
    }

    /**
     * Extract the digits after the colon of a verse "name" attribute value.
     *
     * @param string $string
     * @return string|null Captured digits, or null when the pattern does not match.
     */
    protected function getVerseNumber($string)
    {
        // Only read the capture group when the pattern actually matched.
        if (preg_match("|^[A-Za-z1-3]*\s*[0-9]*:([0-9]*)$|", $string, $matches) !== 1) {
            return null;
        }

        return $matches[1];
    }
}

View File

@ -0,0 +1,173 @@
<?php
namespace App\Handlers;
use App\Book;
use App\Chapter;
use App\Word;
use App\Verse;
use DOMDocument;
use DOMElement;
/**
* UlbXmlImportHandler.php
*
* @author: Leonard Smith <leonard@acornwebconsultants.com>
* Date: 10/2/20
* Time: 8:04 AM
*/
class UlbXmlImportHandler
{
    use BookXmlFilesTrait;

    const REPO_FOLDER = '/ulb/';

    /**
     * Book key => XML file path, for every book file found on disk by run().
     *
     * @var array
     */
    protected $availableFiles = [];

    /**
     * Find which of the known book XML files exist and import each of them.
     *
     * @return void
     */
    public function run() : void
    {
        // Start clean so a repeated run() cannot re-import a file that has
        // disappeared since the previous call.
        $this->availableFiles = [];

        // Collect available xml files
        foreach (array_keys(self::$bookXmlFiles) as $name) {
            $filepath = $this->getBookXmlFilePath(self::REPO_FOLDER, $name);
            if (file_exists($filepath)) {
                $this->availableFiles[$name] = $filepath;
            }
        }

        $this->loopOverXmlFiles();
    }

    /**
     * Import every file previously collected in $availableFiles.
     *
     * @return void
     */
    public function loopOverXmlFiles() : void
    {
        foreach ($this->availableFiles as $name => $filepath) {
            $this->importBook($name, $filepath);
        }
    }

    /**
     * Create a Book and import all <chapter> elements of its XML file.
     *
     * @param string $bookTitle
     * @param string $filepath
     * @return Book
     * @throws \RuntimeException When the XML file cannot be loaded or contains
     *                           unexpected data.
     */
    public function importBook(string $bookTitle, string $filepath) : Book
    {
        // Open the document before creating the Book, so an unreadable file
        // does not leave an empty Book behind.
        $document = $this->openBook($filepath);

        $book = Book::create([
            'name' => $bookTitle,
        ]);

        foreach ($document->getElementsByTagName('chapter') as $chapterElem) {
            $chapter = $this->importChapter($chapterElem);
            $book->chapters()->save($chapter);
        }

        return $book;
    }

    /**
     * Create a Chapter and import all of its <verse> elements.
     *
     * @param DOMElement $chapterElem
     * @return Chapter
     */
    public function importChapter(DOMElement $chapterElem) : Chapter
    {
        $chapter = Chapter::create([
            'name' => $this->parseChapterName($chapterElem),
        ]);

        foreach ($chapterElem->getElementsByTagName('verse') as $verseElem) {
            $verse = $this->importVerse($verseElem);
            $chapter->verses()->save($verse);
        }

        return $chapter;
    }

    /**
     * Create a Verse and import all of its <w> (word) elements.
     *
     * @param DOMElement $verseElem
     * @return Verse
     */
    public function importVerse(DOMElement $verseElem) : Verse
    {
        $verse = Verse::create([
            'name' => $this->parseVerseNumber($verseElem),
            'greek_text' => $this->requiredChildText($verseElem, 'Greek'),
            'ulb_text' => $this->requiredChildText($verseElem, 'ULB'),
        ]);

        foreach ($verseElem->getElementsByTagName('w') as $wordElem) {
            $word = $this->importWord($wordElem);
            $verse->words()->save($word);
        }

        return $verse;
    }

    /**
     * Create a Word from a <w> element.
     *
     * @param DOMElement $wordElem
     * @return Word
     */
    public function importWord(DOMElement $wordElem) : Word
    {
        // NOTE: We have to switch things around a bit as the incoming XML file
        // uses lexeme for lemma and lemma for the strongs number
        return Word::create([
            'ulb' => $wordElem->nodeValue,
            'greek' => $wordElem->getAttribute('text'),
            'lemma' => $wordElem->getAttribute('lexeme'),
            'morph' => $wordElem->getAttribute('morph'),
            'ognt_sort' => $wordElem->getAttribute('OGNTsort'),
            'strongs_number' => $wordElem->getAttribute('lemma'),
        ]);
    }

    /**
     * Extract the trailing digits of the chapter's osisID attribute.
     *
     * @param DOMElement $chapterElem
     * @return string
     * @throws \RuntimeException When the osisID does not match the expected form.
     */
    protected function parseChapterName(DOMElement $chapterElem) : string
    {
        $osisId = $chapterElem->getAttribute('osisID');
        // Without this check a non-matching id would surface as an undefined
        // offset and a return-type error instead of a useful message.
        if (preg_match("|^[A-Za-z1-3]*.([0-9]*)$|", $osisId, $matches) !== 1) {
            throw new \RuntimeException('Unexpected chapter osisID: ' . $osisId);
        }

        return $matches[1];
    }

    /**
     * Extract the digits after the colon of the verse's name attribute.
     *
     * @param DOMElement $verseElem
     * @return string
     * @throws \RuntimeException When the name does not match the expected form.
     */
    protected function parseVerseNumber(DOMElement $verseElem) : string
    {
        $string = $verseElem->getAttribute('name');
        if (preg_match("|^[A-Za-z1-3]*\s*[0-9]*:([0-9]*)$|", $string, $matches) !== 1) {
            throw new \RuntimeException('Unexpected verse name: ' . $string);
        }

        return $matches[1];
    }

    /**
     * Load an XML file into a DOMDocument.
     *
     * @param $filepath
     * @return DOMDocument
     * @throws \RuntimeException When the file cannot be loaded.
     */
    protected function openBook($filepath) : DOMDocument
    {
        $document = new DOMDocument;
        // load() returns false on failure; do not hand back an empty document.
        if ($document->load($filepath) === false) {
            throw new \RuntimeException('Unable to load XML file: ' . $filepath);
        }

        return $document;
    }

    /**
     * Text of the first descendant element with the given tag name.
     *
     * @param DOMElement $parent
     * @param string $tagName
     * @return string
     * @throws \RuntimeException When no such element exists.
     */
    private function requiredChildText(DOMElement $parent, string $tagName) : string
    {
        $node = $parent->getElementsByTagName($tagName)->item(0);
        if ($node === null) {
            throw new \RuntimeException(
                'Missing <' . $tagName . '> element in verse "' . $parent->getAttribute('name') . '"'
            );
        }

        return (string) $node->nodeValue;
    }
}