# Tips_and_Hacks/convert DokuWiki/convert.py

#convert.py
#version 0.3
# by John Wood -- for Tech Advance
# This script reads through the monolithic .md files in a DokuWiki copy of OBS and splits them
# out into chunks in folders named like the .md files. The chunks are named 01.txt to nn.txt
# where `nn` is the last chunk.
# The folders and files are automatically named correctly. The script now creates a
# manifest.json file as well.

# Usage: python convert.py <path to DokuWiki OBS files>

#Import necessary python components

import json     # JSON string escaping for the manifest
import os       # file system commands
import re       # regular expressions
import shutil   # high level file operations
import sys      # command line arguments

from subprocess import call # to fork for git

# Validate the command line: the script takes exactly one argument, the
# directory of the DokuWiki OBS project to convert.
program_name = sys.argv[0]
arguments = sys.argv[1:]
count_args = len(arguments)
if count_args != 1:  # anything other than exactly one argument is a usage error
    print("convert.py script to convert DokuWiki OBS to translationStudio format")
    print("Usage: python convert.py <old directory>")
    sys.exit(1)

# Derive the language code, the book and the output locations from the
# project directory given on the command line. A DokuWiki OBS project
# directory is named <language>_<book>, for example "en_obs".

# Drop trailing separators so that "en_obs/" is treated like "en_obs".
convertdir = sys.argv[1].rstrip("/" + os.sep)

# Only the last path component carries the project information; splitting the
# whole path on underscores would break on parent directories such as
# "/home/my_user/en_obs".
parent_dir, project_name = os.path.split(convertdir)
name_parts = project_name.split("_")

# tS projects look like lll_obs_text_obs rather than lll_obs, and only the
# "obs" book is handled by this script. Checking both here, before anything
# is indexed, avoids an IndexError on names without an underscore and a later
# NameError on an undefined targetpath.
if len(name_parts) != 2 or name_parts[1] != "obs":
    print("This may not be a DokuWiki OBS project and I'm not sure how to handle it")
    sys.exit(1)

language, book = name_parts  # e.g. "en", "obs"

# Older names, retained for backward compatibility; nothing else visible in
# this script reads them.
oldpath = os.path.join(parent_dir, language)
projectinfo = [oldpath, book]
path = oldpath.split("/")

targetpath = convertdir + "_text_obs"  # this gets the target name into the right format

worksite = convertdir + "/content/"  # the actual content is in a subdirectory

# Convert the content directory into the translationStudio layout:
#   * the files of the "front" directory are copied to <targetpath>/front,
#     with ".md" in their names replaced by ".txt";
#   * every other ".md" file becomes a directory of the same name (without
#     ".md") holding the chunks 01.txt..nn.txt, title.txt and reference.txt.


def _split_story(md_path, chunk_dir):
    """Split the story file md_path into text files inside chunk_dir.

    Each line is classified by how it starts:
      * "![Image]" or "[[https": selects the next numbered chunk file
        (01.txt, 02.txt, ...); the line itself is not written.
      * "_": selects reference.txt; the line is written without underscores.
      * "#": selects title.txt; the line is written without "#" characters.
      * anything else that is not blank is written to the currently
        selected file; blank (whitespace-only) lines are dropped.

    Text that appears before any of the marker lines has no file to go to
    and is skipped with a warning.
    """
    chunk_number = 1
    current_file = None  # reset per story so nothing leaks into another story's files
    contents = {}        # output file path -> pieces of text to write to it

    with open(md_path) as mdfile:  # we open the old md to scan it
        for line in mdfile:
            text = ""
            if line.startswith(("![Image]", "[[https")):
                # an image or link line starts the next chunk
                current_file = chunk_dir + "/{:0>2d}".format(chunk_number) + ".txt"
                chunk_number += 1
            elif line.startswith("_"):
                # lines with underscores are the references
                current_file = chunk_dir + "/reference.txt"
                text = line.replace("_", "")
            elif line.startswith("#"):
                # matching title
                current_file = chunk_dir + "/title.txt"
                text = line.replace("#", "")
            elif line.strip():
                # ordinary text: keep it unless the line is blank
                text = line

            if current_file is None:
                if text:
                    print("Skipping text found before the first chunk marker in " + md_path)
                continue
            # Registering the file even for empty text means a chunk file is
            # created for every image/link line, including empty chunks.
            contents.setdefault(current_file, []).append(text)

    # Each file is written once and truncated first, so running the script
    # again does not append a second copy of the text.
    for out_path, pieces in contents.items():
        with open(out_path, "w") as out_file:
            out_file.write("".join(pieces))


for entry in os.listdir(worksite):  # the actual content is in a subdirectory
    if entry == "front":
        front_target = targetpath + "/front"
        if not os.path.exists(front_target):
            os.makedirs(front_target)
        for front_file in os.listdir(worksite + entry + "/"):
            # copy the file to the new location, changing its extension to .txt
            shutil.copyfile(worksite + entry + "/" + front_file,
                            front_target + "/" + front_file.replace(".md", ".txt"))
    elif entry.endswith(".md"):  # all other files we deal with are MarkDown files
        # We start by making a folder/directory matching the name of the md file
        chunk_dir = targetpath + "/" + entry.replace(".md", "")
        if not os.path.exists(chunk_dir):
            os.makedirs(chunk_dir)
        _split_story(worksite + entry, chunk_dir)

# Build the translationStudio manifest.json from the DokuWiki manifest.yaml.
# Three values are taken from the YAML file: the "modified" date, the target
# language name ("title") and the text "direction". Everything else in the
# manifest is fixed text.
modified_pattern = re.compile(r"  modified: '(\d\d\d\d-\d\d-\d\d)'")
title_pattern = re.compile(r"^    title: (.+)")
direction_pattern = re.compile(r"    direction: (\w+)")


def _unquote(value):
    """Return value without surrounding whitespace or one pair of matching quotes."""
    value = value.strip()
    if len(value) >= 2 and value[0] == value[-1] and value[0] in "'\"":
        return value[1:-1]
    return value


modified_date = None
target_language_name = None
target_direction = None

with open(convertdir + "/manifest.yaml") as manfile:
    for manline in manfile:
        title_match = title_pattern.search(manline)
        direction_match = direction_pattern.search(manline)
        modified_date_match = modified_pattern.search(manline)
        # If a key occurs on several lines, the last occurrence wins.
        if modified_date_match:
            modified_date = modified_date_match.group(1)
        elif title_match:
            target_language_name = _unquote(title_match.group(1))
        elif direction_match:
            target_direction = direction_match.group(1)

# Stop with a readable message instead of a NameError when the YAML file does
# not contain everything the manifest needs.
missing_keys = [
    key for key, value in (
        ("modified", modified_date),
        ("title", target_language_name),
        ("direction", target_direction),
    ) if value is None
]
if missing_keys:
    print("Could not find " + ", ".join(missing_keys) + " in " + convertdir + "/manifest.yaml")
    sys.exit(1)

if not os.path.exists(targetpath):
    os.makedirs(targetpath)

# json.dumps() adds the surrounding double quotes and escapes any character
# that would otherwise break the JSON (quotes, backslashes, non-ASCII text).
manifest_lines = [
    '{',
    '\t"package_version": 6,',
    '\t"format": "markdown",',
    '\t"generator": {',
    '\t\t"name": "ts-desktop",',
    '\t\t"build": "132"',
    '\t},',
    '\t"target_language": {',
    '\t\t"id": ' + json.dumps(language) + ',',
    '\t\t"name": ' + json.dumps(target_language_name) + ',',
    '\t\t"direction": ' + json.dumps(target_direction),
    '\t},',
    '\t"project": {',
    '\t\t"id": "obs",',
    '\t\t"name": "Open Bible Stories"',
    '\t},',
    '\t"type": {',
    '\t\t"id": "text",',
    '\t\t"name": "Text"',
    '\t},',
    '\t"resource": {',
    '\t\t"id": "obs",',
    '\t\t"name": "Open Bible Stories"',
    '\t},',
    '\t"source_translations": [',
    '\t\t{',
    '\t\t\t"language_id": "en",',
    '\t\t\t"resource_id": "obs",',
    '\t\t\t"checking_level": "3",',
    '\t\t\t"date_modified": ' + json.dumps(modified_date) + ',',
    '\t\t\t"version": "4"',
    '\t\t}',
    '\t],',
    '\t"parent_draft": {},',
    '\t"translators": [],',
    '\t"finished_chunks": []',
    '}',
]

# "w" rather than "a+": appending to a manifest left over from an earlier run
# would put two JSON documents into one file.
with open(targetpath + "/manifest.json", "w") as newmanfile:
    newmanfile.write('\n'.join(manifest_lines))

# Put the new project under git: initialise a repository in the target
# directory, stage everything and create the first commit.
os.chdir(targetpath)
git_commands = [
    ["git", "init"],
    ["git", "add", "."],
    # the option and its value are separate arguments, as git expects
    ["git", "commit", "-m", "Initial commit"],
]
for git_command in git_commands:
    try:
        exit_code = call(git_command)
    except OSError as error:
        # raised when the git executable cannot be started at all
        print("Warning: could not run '" + " ".join(git_command) + "': " + str(error))
        break
    if exit_code != 0:
        # later steps depend on earlier ones, so stop at the first failure
        print("Warning: '" + " ".join(git_command) + "' exited with status " + str(exit_code))
        break
print("New project written in " + targetpath)