#!/usr/bin/env python
#-*- coding: utf8  -*-
import os
import sys

# prerequisite: python-icu
import icu

# prerequisite: libthai
# *** python libthai binding, see https://veer66.wordpress.com/2009/03/10/%E0%B9%83%E0%B8%8A%E0%B9%89-libthai-%E0%B8%88%E0%B8%B2%E0%B8%81-python/ ***
import libthai

def usage():
    """Write the command-line usage message to standard error.

    The message itself tells the user to redirect standard output into
    ``diff.txt``, so it is written to standard error: that keeps it visible
    on the terminal and out of the generated report.  The program name is
    taken from ``sys.argv[0]``.
    """
    message = """\
Usage: %s FILENAME > diff.txt
(FILENAME must be coding in utf-8)
""" % (sys.argv[0])
    # The extra newline reproduces the blank line the message ends with.
    sys.stderr.write(message + "\n")

def pyicu_brk(txt):
    """Split text into words with ICU's Thai-locale word break iterator.

    ``txt`` is wrapped in an ``icu.UnicodeString``; the same value converted
    with ``unicode()`` is what gets sliced.  Each piece between consecutive
    boundaries reported by the iterator is stripped of surrounding
    whitespace, and pieces that end up empty are dropped.

    Returns a list of the remaining pieces, in order.
    """
    icu_text = icu.UnicodeString(txt)
    text = unicode(icu_text)

    breaker = icu.BreakIterator.createWordInstance(icu.Locale("th"))
    breaker.setText(icu_text)

    # Every boundary closes one piece; that piece opens at the previous
    # boundary, or at offset 0 for the very first one.
    ends = list(breaker)
    starts = [0] + ends[:-1]

    pieces = [text[start:end].strip() for start, end in zip(starts, ends)]
    return [piece for piece in pieces if piece]


def libthai_brk(txt):
    """Split text into words with libthai.

    Args:
        txt: Text handed unchanged to ``libthai.th_brk``.

    Returns:
        The segments produced by ``libthai.th_brk`` with surrounding
        whitespace stripped and empty results removed, or an empty list if
        segmentation raises an exception.
    """
    try:
        # Falsy segments are skipped before stripping, as before, so that a
        # None entry cannot break the strip call.
        stripped = [seg.strip() for seg in libthai.th_brk(txt) if seg]
    except Exception:
        # Best-effort: a failing segmenter yields "no words" rather than
        # aborting the run.  Only Exception is caught so that Ctrl-C and
        # SystemExit still propagate.
        return []
    # Filter again after stripping: whitespace-only segments collapse to ''
    # and must be dropped, matching how pyicu_brk cleans its pieces.
    return [seg for seg in stripped if seg]


def test_diff(txt):
    """Print a report when ICU and libthai segment ``txt`` differently.

    ``txt`` is given to ``pyicu_brk`` as-is and to ``libthai_brk`` after
    ``txt.decode('utf8')``.  When both return equal lists nothing is
    printed.  Otherwise the input and both segmentations (UTF-8 encoded,
    space separated) are printed, followed by a blank line.
    """
    icu_words = pyicu_brk(txt)
    thai_words = libthai_brk(txt.decode('utf8'))
    if icu_words == thai_words:
        return

    # Each line is assembled as one string so the spacing matches what
    # comma-separated print statements would emit.
    print('input:   ' + txt)
    print(' '.join(['pyicu:  '] + [word.encode('utf8') for word in icu_words]))
    print(' '.join(['libthai:'] + [word.encode('utf8') for word in thai_words]))
    print('')


if __name__ == "__main__":
    # A missing FILENAME argument is the one case that is purely a usage
    # error; check it up front instead of relying on an exception.
    if len(sys.argv) < 2:
        usage()
        sys.exit(1)

    try:
        # The context manager closes the file even if reading fails.
        with open(sys.argv[1]) as infile:
            lines = infile.read().split('\n')

        # Compare the two segmenters on every space-separated chunk.
        for line in lines:
            for chunk in line.split(' '):
                chunk = chunk.strip()
                if chunk:
                    test_diff(chunk)
    except (EnvironmentError, UnicodeError) as err:
        # Unreadable file or input that is not valid UTF-8: say what went
        # wrong, then show the usage hint.  Any other exception is a real
        # bug (or Ctrl-C) and is left to propagate with its traceback.
        sys.stderr.write("%s: %s\n" % (sys.argv[0], err))
        usage()
        sys.exit(1)
