mirror of
https://github.com/YunoHost-Apps/mediawiki_ynh.git
synced 2024-09-03 19:46:05 +02:00
391 lines
13 KiB
Python
Executable file
391 lines
13 KiB
Python
Executable file
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
# @author Philip
|
|
import tarfile as tf
|
|
import zipfile as zf
|
|
import os, re, shutil, sys, platform
|
|
|
|
pyversion = platform.python_version()
|
|
islinux = platform.system().lower() == 'linux'
|
|
|
|
if pyversion[:3] in ['2.6', '2.7']:
|
|
import urllib as urllib_request
|
|
import codecs
|
|
open = codecs.open
|
|
_unichr = unichr
|
|
if sys.maxunicode < 0x10000:
|
|
def unichr(i):
|
|
if i < 0x10000:
|
|
return _unichr(i)
|
|
else:
|
|
return _unichr( 0xD7C0 + ( i>>10 ) ) + _unichr( 0xDC00 + ( i & 0x3FF ) )
|
|
elif pyversion[:2] == '3.':
|
|
import urllib.request as urllib_request
|
|
unichr = chr
|
|
|
|
def unichr2( *args ):
|
|
return [unichr( int( i.split('<')[0][2:], 16 ) ) for i in args]
|
|
|
|
def unichr3( *args ):
|
|
return [unichr( int( i[2:7], 16 ) ) for i in args if i[2:7]]
|
|
|
|
# DEFINE
|
|
UNIHAN_VER = '6.2.0'
|
|
SF_MIRROR = 'dfn'
|
|
SCIM_TABLES_VER = '0.5.11'
|
|
SCIM_PINYIN_VER = '0.5.92'
|
|
LIBTABE_VER = '0.2.3'
|
|
# END OF DEFINE
|
|
|
|
def download( url, dest ):
|
|
if os.path.isfile( dest ):
|
|
print( 'File %s is up to date.' % dest )
|
|
return
|
|
global islinux
|
|
if islinux:
|
|
# we use wget instead urlretrieve under Linux,
|
|
# because wget could display details like download progress
|
|
os.system( 'wget %s -O %s' % ( url, dest ) )
|
|
else:
|
|
print( 'Downloading from [%s] ...' % url )
|
|
urllib_request.urlretrieve( url, dest )
|
|
print( 'Download complete.\n' )
|
|
return
|
|
|
|
def uncompress( fp, member, encoding = 'U8' ):
|
|
name = member.rsplit( '/', 1 )[-1]
|
|
print( 'Extracting %s ...' % name )
|
|
fp.extract( member )
|
|
shutil.move( member, name )
|
|
if '/' in member:
|
|
shutil.rmtree( member.split( '/', 1 )[0] )
|
|
return open( name, 'rb', encoding, 'ignore' )
|
|
|
|
unzip = lambda path, member, encoding = 'U8': \
|
|
uncompress( zf.ZipFile( path ), member, encoding )
|
|
|
|
untargz = lambda path, member, encoding = 'U8': \
|
|
uncompress( tf.open( path, 'r:gz' ), member, encoding )
|
|
|
|
def parserCore( fp, pos, beginmark = None, endmark = None ):
|
|
if beginmark and endmark:
|
|
start = False
|
|
else: start = True
|
|
mlist = set()
|
|
for line in fp:
|
|
if beginmark and line.startswith( beginmark ):
|
|
start = True
|
|
continue
|
|
elif endmark and line.startswith( endmark ):
|
|
break
|
|
if start and not line.startswith( '#' ):
|
|
elems = line.split()
|
|
if len( elems ) < 2:
|
|
continue
|
|
elif len( elems[0] ) > 1 and \
|
|
len( elems[pos] ) > 1: # words only
|
|
mlist.add( elems[pos] )
|
|
return mlist
|
|
|
|
def tablesParser( path, name ):
|
|
""" Read file from scim-tables and parse it. """
|
|
global SCIM_TABLES_VER
|
|
src = 'scim-tables-%s/tables/zh/%s' % ( SCIM_TABLES_VER, name )
|
|
fp = untargz( path, src, 'U8' )
|
|
return parserCore( fp, 1, 'BEGIN_TABLE', 'END_TABLE' )
|
|
|
|
ezbigParser = lambda path: tablesParser( path, 'EZ-Big.txt.in' )
|
|
wubiParser = lambda path: tablesParser( path, 'Wubi.txt.in' )
|
|
zrmParser = lambda path: tablesParser( path, 'Ziranma.txt.in' )
|
|
|
|
def phraseParser( path ):
|
|
""" Read phrase_lib.txt and parse it. """
|
|
global SCIM_PINYIN_VER
|
|
src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
|
|
dst = 'phrase_lib.txt'
|
|
fp = untargz( path, src, 'U8' )
|
|
return parserCore( fp, 0 )
|
|
|
|
def tsiParser( path ):
|
|
""" Read tsi.src and parse it. """
|
|
src = 'libtabe/tsi-src/tsi.src'
|
|
dst = 'tsi.src'
|
|
fp = untargz( path, src, 'big5hkscs' )
|
|
return parserCore( fp, 0 )
|
|
|
|
def unihanParser( path ):
|
|
""" Read Unihan_Variants.txt and parse it. """
|
|
fp = unzip( path, 'Unihan_Variants.txt', 'U8' )
|
|
t2s = dict()
|
|
s2t = dict()
|
|
for line in fp:
|
|
if line.startswith( '#' ):
|
|
continue
|
|
else:
|
|
elems = line.split()
|
|
if len( elems ) < 3:
|
|
continue
|
|
type = elems.pop( 1 )
|
|
elems = unichr2( *elems )
|
|
if type == 'kTraditionalVariant':
|
|
s2t[elems[0]] = elems[1:]
|
|
elif type == 'kSimplifiedVariant':
|
|
t2s[elems[0]] = elems[1:]
|
|
fp.close()
|
|
return ( t2s, s2t )
|
|
|
|
def applyExcludes( mlist, path ):
|
|
""" Apply exclude rules from path to mlist. """
|
|
excludes = open( path, 'rb', 'U8' ).read().split()
|
|
excludes = [word.split( '#' )[0].strip() for word in excludes]
|
|
excludes = '|'.join( excludes )
|
|
excptn = re.compile( '.*(?:%s).*' % excludes )
|
|
diff = [mword for mword in mlist if excptn.search( mword )]
|
|
mlist.difference_update( diff )
|
|
return mlist
|
|
|
|
def charManualTable( path ):
|
|
fp = open( path, 'rb', 'U8' )
|
|
ret = {}
|
|
for line in fp:
|
|
elems = line.split( '#' )[0].split( '|' )
|
|
elems = unichr3( *elems )
|
|
if len( elems ) > 1:
|
|
ret[elems[0]] = elems[1:]
|
|
return ret
|
|
|
|
def toManyRules( src_table ):
|
|
tomany = set()
|
|
for ( f, t ) in src_table.iteritems():
|
|
for i in range( 1, len( t ) ):
|
|
tomany.add( t[i] )
|
|
return tomany
|
|
|
|
def removeRules( path, table ):
|
|
fp = open( path, 'rb', 'U8' )
|
|
texc = list()
|
|
for line in fp:
|
|
elems = line.split( '=>' )
|
|
f = t = elems[0].strip()
|
|
if len( elems ) == 2:
|
|
t = elems[1].strip()
|
|
f = f.strip('"').strip("'")
|
|
t = t.strip('"').strip("'")
|
|
if f:
|
|
try:
|
|
table.pop( f )
|
|
except:
|
|
pass
|
|
if t:
|
|
texc.append( t )
|
|
texcptn = re.compile( '^(?:%s)$' % '|'.join( texc ) )
|
|
for (tmp_f, tmp_t) in table.copy().iteritems():
|
|
if texcptn.match( tmp_t ):
|
|
table.pop( tmp_f )
|
|
return table
|
|
|
|
def customRules( path ):
|
|
fp = open( path, 'rb', 'U8' )
|
|
ret = dict()
|
|
for line in fp:
|
|
elems = line.split( '#' )[0].split()
|
|
if len( elems ) > 1:
|
|
ret[elems[0]] = elems[1]
|
|
return ret
|
|
|
|
def dictToSortedList( src_table, pos ):
|
|
return sorted( src_table.items(), key = lambda m: m[pos] )
|
|
|
|
def translate( text, conv_table ):
|
|
i = 0
|
|
while i < len( text ):
|
|
for j in range( len( text ) - i, 0, -1 ):
|
|
f = text[i:][:j]
|
|
t = conv_table.get( f )
|
|
if t:
|
|
text = text[:i] + t + text[i:][j:]
|
|
i += len(t) - 1
|
|
break
|
|
i += 1
|
|
return text
|
|
|
|
def manualWordsTable( path, conv_table, reconv_table ):
|
|
fp = open( path, 'rb', 'U8' )
|
|
reconv_table = {}
|
|
wordlist = [line.split( '#' )[0].strip() for line in fp]
|
|
wordlist = list( set( wordlist ) )
|
|
wordlist.sort( key = len, reverse = True )
|
|
while wordlist:
|
|
word = wordlist.pop()
|
|
new_word = translate( word, conv_table )
|
|
rcv_word = translate( word, reconv_table )
|
|
if word != rcv_word:
|
|
reconv_table[word] = word
|
|
reconv_table[new_word] = word
|
|
return reconv_table
|
|
|
|
def defaultWordsTable( src_wordlist, src_tomany, char_conv_table, char_reconv_table ):
|
|
wordlist = list( src_wordlist )
|
|
wordlist.sort( key = len, reverse = True )
|
|
word_conv_table = {}
|
|
word_reconv_table = {}
|
|
conv_table = char_conv_table.copy()
|
|
reconv_table = char_reconv_table.copy()
|
|
tomanyptn = re.compile( '(?:%s)' % '|'.join( src_tomany ) )
|
|
while wordlist:
|
|
conv_table.update( word_conv_table )
|
|
reconv_table.update( word_reconv_table )
|
|
word = wordlist.pop()
|
|
new_word_len = word_len = len( word )
|
|
while new_word_len == word_len:
|
|
add = False
|
|
test_word = translate( word, reconv_table )
|
|
new_word = translate( word, conv_table )
|
|
if not reconv_table.get( new_word ) \
|
|
and ( test_word != word \
|
|
or ( tomanyptn.search( word ) \
|
|
and word != translate( new_word, reconv_table ) ) ):
|
|
word_conv_table[word] = new_word
|
|
word_reconv_table[new_word] = word
|
|
try:
|
|
word = wordlist.pop()
|
|
except IndexError:
|
|
break
|
|
new_word_len = len(word)
|
|
return word_reconv_table
|
|
|
|
def PHPArray( table ):
|
|
lines = ['\'%s\' => \'%s\',' % (f, t) for (f, t) in table if f and t]
|
|
return '\n'.join(lines)
|
|
|
|
def main():
|
|
#Get Unihan.zip:
|
|
url = 'http://www.unicode.org/Public/%s/ucd/Unihan.zip' % UNIHAN_VER
|
|
han_dest = 'Unihan.zip'
|
|
download( url, han_dest )
|
|
|
|
# Get scim-tables-$(SCIM_TABLES_VER).tar.gz:
|
|
url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % ( SF_MIRROR, SCIM_TABLES_VER )
|
|
tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
|
|
download( url, tbe_dest )
|
|
|
|
# Get scim-pinyin-$(SCIM_PINYIN_VER).tar.gz:
|
|
url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % ( SF_MIRROR, SCIM_PINYIN_VER )
|
|
pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
|
|
download( url, pyn_dest )
|
|
|
|
# Get libtabe-$(LIBTABE_VER).tgz:
|
|
url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % ( SF_MIRROR, LIBTABE_VER )
|
|
lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
|
|
download( url, lbt_dest )
|
|
|
|
# Unihan.txt
|
|
( t2s_1tomany, s2t_1tomany ) = unihanParser( han_dest )
|
|
|
|
t2s_1tomany.update( charManualTable( 'trad2simp.manual' ) )
|
|
s2t_1tomany.update( charManualTable( 'simp2trad.manual' ) )
|
|
|
|
t2s_1to1 = dict( [( f, t[0] ) for ( f, t ) in t2s_1tomany.iteritems()] )
|
|
s2t_1to1 = dict( [( f, t[0] ) for ( f, t ) in s2t_1tomany.iteritems()] )
|
|
|
|
s_tomany = toManyRules( t2s_1tomany )
|
|
t_tomany = toManyRules( s2t_1tomany )
|
|
|
|
# noconvert rules
|
|
t2s_1to1 = removeRules( 'trad2simp_noconvert.manual', t2s_1to1 )
|
|
s2t_1to1 = removeRules( 'simp2trad_noconvert.manual', s2t_1to1 )
|
|
|
|
# the supper set for word to word conversion
|
|
t2s_1to1_supp = t2s_1to1.copy()
|
|
s2t_1to1_supp = s2t_1to1.copy()
|
|
t2s_1to1_supp.update( customRules( 'trad2simp_supp_set.manual' ) )
|
|
s2t_1to1_supp.update( customRules( 'simp2trad_supp_set.manual' ) )
|
|
|
|
# word to word manual rules
|
|
t2s_word2word_manual = manualWordsTable( 'simpphrases.manual', s2t_1to1_supp, t2s_1to1_supp )
|
|
t2s_word2word_manual.update( customRules( 'toSimp.manual' ) )
|
|
s2t_word2word_manual = manualWordsTable( 'tradphrases.manual', t2s_1to1_supp, s2t_1to1_supp )
|
|
s2t_word2word_manual.update( customRules( 'toTrad.manual' ) )
|
|
|
|
# word to word rules from input methods
|
|
t_wordlist = set()
|
|
s_wordlist = set()
|
|
t_wordlist.update( ezbigParser( tbe_dest ),
|
|
tsiParser( lbt_dest ) )
|
|
s_wordlist.update( wubiParser( tbe_dest ),
|
|
zrmParser( tbe_dest ),
|
|
phraseParser( pyn_dest ) )
|
|
|
|
# exclude
|
|
s_wordlist = applyExcludes( s_wordlist, 'simpphrases_exclude.manual' )
|
|
t_wordlist = applyExcludes( t_wordlist, 'tradphrases_exclude.manual' )
|
|
|
|
s2t_supp = s2t_1to1_supp.copy()
|
|
s2t_supp.update( s2t_word2word_manual )
|
|
t2s_supp = t2s_1to1_supp.copy()
|
|
t2s_supp.update( t2s_word2word_manual )
|
|
|
|
# parse list to dict
|
|
t2s_word2word = defaultWordsTable( s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp )
|
|
t2s_word2word.update( t2s_word2word_manual )
|
|
s2t_word2word = defaultWordsTable( t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp )
|
|
s2t_word2word.update( s2t_word2word_manual )
|
|
|
|
# Final tables
|
|
# sorted list toHans
|
|
t2s_1to1 = dict( [( f, t ) for ( f, t ) in t2s_1to1.iteritems() if f != t] )
|
|
toHans = dictToSortedList( t2s_1to1, 0 ) + dictToSortedList( t2s_word2word, 1 )
|
|
# sorted list toHant
|
|
s2t_1to1 = dict( [( f, t ) for ( f, t ) in s2t_1to1.iteritems() if f != t] )
|
|
toHant = dictToSortedList( s2t_1to1, 0 ) + dictToSortedList( s2t_word2word, 1 )
|
|
# sorted list toCN
|
|
toCN = dictToSortedList( customRules( 'toCN.manual' ), 1 )
|
|
# sorted list toHK
|
|
toHK = dictToSortedList( customRules( 'toHK.manual' ), 1 )
|
|
# sorted list toSG
|
|
toSG = dictToSortedList( customRules( 'toSG.manual' ), 1 )
|
|
# sorted list toTW
|
|
toTW = dictToSortedList( customRules( 'toTW.manual' ), 1 )
|
|
|
|
# Get PHP Array
|
|
php = '''<?php
|
|
/**
|
|
* Simplified / Traditional Chinese conversion tables
|
|
*
|
|
* Automatically generated using code and data in includes/zhtable/
|
|
* Do not modify directly!
|
|
*
|
|
* @file
|
|
*/
|
|
|
|
$zh2Hant = array(\n'''
|
|
php += PHPArray( toHant ) \
|
|
+ '\n);\n\n$zh2Hans = array(\n' \
|
|
+ PHPArray( toHans ) \
|
|
+ '\n);\n\n$zh2TW = array(\n' \
|
|
+ PHPArray( toTW ) \
|
|
+ '\n);\n\n$zh2HK = array(\n' \
|
|
+ PHPArray( toHK ) \
|
|
+ '\n);\n\n$zh2CN = array(\n' \
|
|
+ PHPArray( toCN ) \
|
|
+ '\n);\n\n$zh2SG = array(\n' \
|
|
+ PHPArray( toSG ) \
|
|
+ '\n);\n'
|
|
|
|
f = open( os.path.join( '..', '..', '..', 'includes', 'ZhConversion.php' ), 'wb', encoding = 'utf8' )
|
|
print ('Writing ZhConversion.php ... ')
|
|
f.write( php )
|
|
f.close()
|
|
|
|
# Remove temporary files
|
|
print ('Deleting temporary files ... ')
|
|
os.remove('EZ-Big.txt.in')
|
|
os.remove('phrase_lib.txt')
|
|
os.remove('tsi.src')
|
|
os.remove('Unihan_Variants.txt')
|
|
os.remove('Wubi.txt.in')
|
|
os.remove('Ziranma.txt.in')
|
|
|
|
|
|
if __name__ == '__main__':
|
|
main()
|