2017-10-31 02:47:47 +00:00
|
|
|
#convert.py
|
|
|
|
#version 0.3
|
|
|
|
# by John Wood -- for Tech Advance
|
|
|
|
# This script reads through the monolithic .md files in a DokuWiki copy of OBS and splits them
|
|
|
|
# out into chunks in folders named like the .md files. The chunks are named 01.txt to nn.txt
|
|
|
|
# where `nn` is the last chunk.
|
|
|
|
# The folders and files are automatically named correctly. The script now creates a
|
|
|
|
# manifest.json file as well.
|
2017-08-16 16:51:31 +00:00
|
|
|
|
2017-10-31 02:46:39 +00:00
|
|
|
# Usage: python convert.py <path to DokuWiki OBS files>
|
|
|
|
|
2017-10-31 02:47:47 +00:00
|
|
|
#Import necessary python components
|
2017-10-31 02:46:39 +00:00
|
|
|
|
2017-10-31 02:47:47 +00:00
|
|
|
import json # manifest serialization
import os # file system commands
import re # regular expressions
import shutil # high level file operations
import sys # command line arguments
from subprocess import call # to fork for git
|
2017-08-16 16:51:31 +00:00
|
|
|
|
2017-10-31 02:47:47 +00:00
|
|
|
# A story frame starts with its illustration, written either as a Markdown
# image ("![Image]...") or as a "[[https..." link. Either one opens a new
# numbered chunk.
FRAME_MARKER = re.compile(r"!\[Image\]|\[\[https")

# Values pulled out of manifest.yaml, in the order they are tried on each line
# (a line is consumed by the first pattern that matches it).
# NOTE(review): the patterns expect exactly one space before the key, and the
# title pattern is anchored to the start of the line -- confirm this against
# the indentation of a real manifest.yaml.
MANIFEST_PATTERNS = (
    ("modified", re.compile(r" modified: '(\d\d\d\d-\d\d-\d\d)'")),
    ("title", re.compile(r"^ title: (.+)")),
    ("direction", re.compile(r" direction: (\w+)")),
)

NOT_OBS_MESSAGE = ("This may not be a DokuWiki OBS project and I'm not sure "
                   "how to handle it")


def parse_project_name(convertdir):
    """Return (language, book) for a project directory named `lll_book`.

    Only the last path component is examined, so underscores in parent
    directories and a trailing slash do not matter. Returns None when the
    directory name does not consist of exactly two underscore-separated
    parts (tS projects look like lll_obs_text_obs rather than lll_obs).
    """
    name = os.path.basename(convertdir.rstrip("/" + os.sep))
    parts = name.split("_")
    if len(parts) != 2:
        return None
    return parts[0], parts[1]


def split_story(lines, source="<input>"):
    """Split the lines of one story into a {file name: text} mapping.

    * An image line opens the next numbered chunk (01.txt, 02.txt, ...) and
      is itself discarded; the chunk exists even if no text follows it.
    * A line starting with "_" is the reference; it goes to reference.txt
      with every underscore removed.
    * A line starting with "#" is the title; it goes to title.txt with every
      "#" removed.
    * Blank (empty or whitespace-only) lines are dropped.
    * Any other line is appended to whichever file was selected last.

    Text that appears before the first marker has no destination; it is
    skipped and reported on stderr, naming `source`.
    """
    chunks = {}
    current = None  # file name that plain text lines are appended to
    next_number = 1
    orphaned = 0
    for line in lines:
        if FRAME_MARKER.match(line):
            current = "{:0>2d}.txt".format(next_number)
            next_number += 1
            chunks.setdefault(current, [])
        elif line.startswith("_"):
            current = "reference.txt"
            chunks.setdefault(current, []).append(line.replace("_", ""))
        elif line.startswith("#"):
            current = "title.txt"
            chunks.setdefault(current, []).append(line.replace("#", ""))
        elif not line.strip():
            continue
        elif current is None:
            orphaned += 1
        else:
            chunks[current].append(line)
    if orphaned:
        sys.stderr.write(
            "Warning: skipped {} line(s) before the first title, image or "
            "reference in {}\n".format(orphaned, source))
    return {name: "".join(parts) for name, parts in chunks.items()}


def convert_front(front_dir, target_dir):
    """Copy the front-matter files, changing their extension to .txt."""
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    for name in os.listdir(front_dir):
        shutil.copyfile(os.path.join(front_dir, name),
                        os.path.join(target_dir, name.replace(".md", ".txt")))


def convert_story(md_path, story_dir):
    """Split one monolithic .md story into chunk files inside `story_dir`.

    Files are written in one go with mode "w", so running the script again
    over an existing target replaces the chunks instead of appending to them.
    """
    if not os.path.exists(story_dir):
        os.makedirs(story_dir)
    # utf-8-sig also accepts plain UTF-8 and drops a leading BOM, which would
    # otherwise hide the "#" of a title on the first line.
    with open(md_path, encoding="utf-8-sig") as md_file:
        chunks = split_story(md_file, source=md_path)
    for name, text in chunks.items():
        with open(os.path.join(story_dir, name), "w", encoding="utf-8") as chunk_file:
            chunk_file.write(text)


def convert_content(worksite, targetpath):
    """Convert everything in the project's content directory."""
    for filename in sorted(os.listdir(worksite)):
        if filename == "front":
            convert_front(os.path.join(worksite, filename),
                          os.path.join(targetpath, "front"))
        elif filename.endswith(".md"):  # all other files we deal with are MarkDown files
            # The folder is named like the .md file, minus the extension.
            story_dir = os.path.join(targetpath, filename.replace(".md", ""))
            convert_story(os.path.join(worksite, filename), story_dir)


def read_manifest_fields(lines):
    """Collect the "modified", "title" and "direction" values from manifest lines.

    Returns a dict holding only the keys that were found; when a key occurs
    on several lines the last occurrence wins.
    """
    found = {}
    for line in lines:
        for key, pattern in MANIFEST_PATTERNS:
            match = pattern.search(line)
            if match:
                found[key] = match.group(1)
                break
    return found


def build_manifest(language, language_name, direction, modified_date):
    """Return the translationStudio manifest as a plain dictionary."""
    return {
        "package_version": 6,
        "format": "markdown",
        "generator": {"name": "ts-desktop", "build": "132"},
        "target_language": {
            "id": language,
            "name": language_name,
            "direction": direction,
        },
        "project": {"id": "obs", "name": "Open Bible Stories"},
        "type": {"id": "text", "name": "Text"},
        "resource": {"id": "obs", "name": "Open Bible Stories"},
        "source_translations": [
            {
                "language_id": "en",
                "resource_id": "obs",
                "checking_level": "3",
                "date_modified": modified_date,
                "version": "4",
            },
        ],
        "parent_draft": {},
        "translators": [],
        "finished_chunks": [],
    }


def render_manifest(language, language_name, direction, modified_date):
    """Return the manifest.json text.

    Serializing with json.dumps (instead of gluing strings together) keeps
    the file valid even when a value contains quotes or backslashes.
    """
    manifest = build_manifest(language, language_name, direction, modified_date)
    return json.dumps(manifest, indent=2, ensure_ascii=False) + "\n"


def init_git_repo(repo_dir):
    """Create a git repository in `repo_dir` and commit its contents.

    Returns True when every git command succeeded. The sequence stops at the
    first failure: if `git init` did not work, a following `git add .` could
    otherwise stage the files in an enclosing repository.
    """
    commands = (
        ["git", "init"],
        ["git", "add", "."],
        ["git", "commit", "-m", "Initial commit"],
    )
    for command in commands:
        if call(command, cwd=repo_dir) != 0:
            sys.stderr.write(
                "Warning: '{}' failed in {}\n".format(" ".join(command), repo_dir))
            return False
    return True


def main(argv=None):
    """Convert the DokuWiki OBS project named on the command line.

    `argv` is the argument list without the program name; it defaults to
    sys.argv[1:]. Exits with status 1 on a usage error, when the directory
    does not look like an `lll_obs` project, or when required input is
    missing or incomplete.
    """
    arguments = sys.argv[1:] if argv is None else list(argv)
    if len(arguments) != 1:  # If there is not exactly one argument, fail with a usage remark.
        print("convert.py script to convert DokuWiki OBS to translationStudio format")
        print("Usage: python convert.py <old directory>")
        sys.exit(1)

    convertdir = arguments[0].rstrip("/" + os.sep)
    project = parse_project_name(convertdir)
    if project is None or project[1] != "obs":
        print(NOT_OBS_MESSAGE)
        sys.exit(1)
    language = project[0]

    targetpath = convertdir + "_text_obs"  # this gets the target name into the right format
    worksite = os.path.join(convertdir, "content")  # the actual content is in a subdirectory
    if not os.path.isdir(worksite):
        print("No content directory found at " + worksite)
        sys.exit(1)

    # Read the manifest before writing anything, so that a missing or
    # incomplete manifest does not leave a half-converted project behind.
    manifest_path = os.path.join(convertdir, "manifest.yaml")
    try:
        with open(manifest_path, encoding="utf-8-sig") as manfile:
            fields = read_manifest_fields(manfile)
    except OSError as err:
        print("Cannot read " + manifest_path + ": " + str(err))
        sys.exit(1)
    missing = [key for key, _ in MANIFEST_PATTERNS if key not in fields]
    if missing:
        print("Could not find " + ", ".join(missing) + " in " + manifest_path)
        sys.exit(1)

    if not os.path.exists(targetpath):
        os.makedirs(targetpath)
    convert_content(worksite, targetpath)

    # Mode "w": a second run must replace the manifest, not append another
    # JSON document to it.
    with open(os.path.join(targetpath, "manifest.json"), "w", encoding="utf-8") as newmanfile:
        newmanfile.write(render_manifest(
            language, fields["title"], fields["direction"], fields["modified"]))

    init_git_repo(targetpath)
    print("New project written in " + targetpath)


if __name__ == "__main__":
    main()
|