1######################## BEGIN LICENSE BLOCK ########################
2# The Original Code is mozilla.org code.
3#
4# The Initial Developer of the Original Code is
5# Netscape Communications Corporation.
6# Portions created by the Initial Developer are Copyright (C) 1998
7# the Initial Developer. All Rights Reserved.
8#
9# Contributor(s):
10#   Mark Pilgrim - port to Python
11#
12# This library is free software; you can redistribute it and/or
13# modify it under the terms of the GNU Lesser General Public
14# License as published by the Free Software Foundation; either
15# version 2.1 of the License, or (at your option) any later version.
16#
17# This library is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20# Lesser General Public License for more details.
21#
22# You should have received a copy of the GNU Lesser General Public
23# License along with this library; if not, write to the Free Software
24# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25# 02110-1301  USA
26######################### END LICENSE BLOCK #########################
27
28from . import constants
29from .charsetprober import CharSetProber
30from .codingstatemachine import CodingStateMachine
31from .mbcssm import UTF8SMModel
32
# Factor by which UTF8Prober.get_confidence() multiplies its "unlike" estimate
# once for every multi-byte character counted so far (only while fewer than
# six such characters have been seen).
ONE_CHAR_PROB = 0.5
34
35
class UTF8Prober(CharSetProber):
    """Charset prober that reports "utf-8".

    Input is run through a CodingStateMachine built from UTF8SMModel, and
    the number of characters whose length is at least two bytes is counted.
    That count drives the confidence value.
    """

    def __init__(self):
        """Create the coding state machine and put the prober in its initial state."""
        CharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(UTF8SMModel)
        self.reset()

    def reset(self):
        """Reset the base prober, the multi-byte counter and the state machine."""
        CharSetProber.reset(self)
        self._mNumOfMBChar = 0
        self._mCodingSM.reset()

    def get_charset_name(self):
        """Return the name of the charset this prober detects."""
        return "utf-8"

    def feed(self, aBuf):
        """Feed the items of ``aBuf`` to the state machine, one at a time.

        Processing stops early when the state machine reports an error
        (prober state becomes eNotMe) or a definite match (prober state
        becomes eFoundIt).  If the prober is still detecting afterwards
        and the confidence exceeds constants.SHORTCUT_THRESHOLD, the state
        is promoted to eFoundIt.

        Returns the prober state as reported by ``get_state()``.
        """
        machine = self._mCodingSM
        for item in aBuf:
            outcome = machine.next_state(item)
            if outcome == constants.eError:
                self._mState = constants.eNotMe
                break
            if outcome == constants.eItsMe:
                self._mState = constants.eFoundIt
                break
            # Back at eStart with a current char length of two or more:
            # count it as one more multi-byte character.
            if outcome == constants.eStart:
                if machine.get_current_charlen() >= 2:
                    self._mNumOfMBChar += 1

        # Shortcut: enough evidence already, no need to keep detecting.
        if self.get_state() == constants.eDetecting:
            if self.get_confidence() > constants.SHORTCUT_THRESHOLD:
                self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the confidence derived from the multi-byte character count.

        With six or more counted characters the result is 0.99.  With
        fewer, 0.99 is multiplied by ONE_CHAR_PROB once per counted
        character and the product is subtracted from 1.0.
        """
        doubt = 0.99
        counted = self._mNumOfMBChar
        if counted >= 6:
            return doubt
        for _ in range(counted):
            doubt *= ONE_CHAR_PROB
        return 1.0 - doubt
77