######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################

from . import constants
from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine
from .mbcssm import UTF8SMModel

# Factor applied to the "unlikely" estimate once per multi-byte character
# seen; see UTF8Prober.get_confidence().
ONE_CHAR_PROB = 0.5


class UTF8Prober(CharSetProber):
    """Charset prober that reports "utf-8".

    Every input byte is pushed through a ``CodingStateMachine`` built from
    ``UTF8SMModel``.  The prober counts how many completed characters were
    at least two bytes long and derives its confidence from that count.
    """

    def __init__(self):
        CharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(UTF8SMModel)
        self.reset()

    def reset(self):
        """Reset the base prober, the state machine and the character count."""
        CharSetProber.reset(self)
        self._mCodingSM.reset()
        # Number of completed characters whose length was >= 2 bytes.
        self._mNumOfMBChar = 0

    def get_charset_name(self):
        """Return the name of the charset this prober detects."""
        return "utf-8"

    def feed(self, aBuf):
        """Feed the elements of *aBuf* to the state machine, one at a time.

        Stops early when the state machine reports ``constants.eError``
        (state becomes ``constants.eNotMe``) or ``constants.eItsMe`` (state
        becomes ``constants.eFoundIt``).

        Returns the value of ``self.get_state()`` after processing.
        """
        for c in aBuf:
            codingState = self._mCodingSM.next_state(c)
            if codingState == constants.eError:
                self._mState = constants.eNotMe
                break
            elif codingState == constants.eItsMe:
                self._mState = constants.eFoundIt
                break
            elif codingState == constants.eStart:
                # The machine is back at its start state; count the
                # character just finished if it was two bytes or longer.
                if self._mCodingSM.get_current_charlen() >= 2:
                    self._mNumOfMBChar += 1

        # Still undecided after this buffer: settle on "found" as soon as
        # the confidence exceeds the shortcut threshold.
        if self.get_state() == constants.eDetecting:
            if self.get_confidence() > constants.SHORTCUT_THRESHOLD:
                self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return a confidence value derived from the multi-byte count.

        With fewer than six multi-byte characters the result is
        ``1.0 - 0.99 * ONE_CHAR_PROB ** count``; from six onwards it is a
        flat ``0.99``.
        """
        unlike = 0.99
        if self._mNumOfMBChar < 6:
            for _ in range(self._mNumOfMBChar):
                unlike = unlike * ONE_CHAR_PROB
            return 1.0 - unlike
        return unlike