2007年1月23日火曜日

FePyのElementTreeモジュール 3

既に見つけていたpyexpat.pyに手を加えて、XMLであればXML宣言のencodingを、HTMLであればmetaタグのcharsetを使って、文字列をユニコードに変換する処理を書き加えてみました。そのソースコードを以下に示します。




# Copyright (c) 2005, 2006 Seo Sanghyeon



# A chapter from Dan Wahlin's "XML for ASP.NET Developers" is useful

# for understanding this code. Posted to informit.com, 2002-02-22.

# http://www.informit.com/articles/article.asp?p=25485



# 2005-11-16 sanxiyn Created

# 2006-08-18 sanxiyn Merged changes from Mark Rees

# * Adapted to the new way to load .NET libraries

# * Handle empty elements

# 2006-08-29 sanxiyn Added support for XML namespaces

# Simplified code a lot

# 2006-10-21 sanxiyn Minimal support for xml.sax

# 2006-10-24 sanxiyn Implemented ordered_attributes, namespace_prefixes

# 2006-10-27 sanxiyn Added expat.error

# 2006-10-29 sanxiyn Implemented Start/End NamespaceDeclHandler

# 2006-11-20 sanxiyn Merged changes from Fredrik Lundh

# * Handle multiple calls to Parse()



import clr

clr.AddReference("System.Xml")



from System import Enum

from System.IO import StringReader

from System.Xml import XmlReader, XmlNodeType



def ParserCreate(encoding=None, namespace_separator=None, **kw):
    """Build a new xmlparser.

    Only ``namespace_separator`` is forwarded to the parser.  ``encoding``
    is accepted but not used here.  Extra keyword arguments are swallowed
    on purpose: xml.sax passes an undocumented keyword argument "intern",
    and it is simply ignored.
    """
    parser = xmlparser(namespace_separator)
    return parser



# Parameter-entity parsing flags.
# NOTE(review): presumably these are the values callers hand to
# xmlparser.SetParamEntityParsing() -- confirm against xml.sax / Kid.

# Used by xml.sax
XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE = 1

# Used by Kid
XML_PARAM_ENTITY_PARSING_ALWAYS = 2



class error(Exception):
    """Exception class exported by this module under the name ``error``.

    It adds nothing to ``Exception``; it exists so that callers have an
    ``expat.error`` type to refer to.
    """



class xmlparser(object):
    """pyexpat-style parser object backed by System.Xml.XmlReader.

    Input is buffered by Parse() and parsed in one go once the final chunk
    arrives.  For every node the reader produces, a method named
    ``_handle_<XmlNodeType name>`` is called if it exists; node types
    without such a method are skipped.

    Names, attribute values and character data are passed through
    _handle_unicodechar(), which tries to decode them with the encoding
    taken from the XML declaration or from an HTML ``<meta>`` charset.

    Handler slots that were never assigned raise AttributeError on access,
    which is why every handler call is guarded with hasattr().
    """

    __slots__ = [
        # Internal
        "_data",
        "_separator",
        "_reader",
        "_ns_stack",
        "_enc",

        # Attributes
        # Implemented
        "ordered_attributes",
        "namespace_prefixes",
        # Stub for xml.dom
        "buffer_text",
        "specified_attributes",

        # Handlers
        # Implemented
        "StartElementHandler",
        "EndElementHandler",
        "CharacterDataHandler",
        "StartNamespaceDeclHandler",
        "EndNamespaceDeclHandler",
        # Stub for ElementTree
        "DefaultHandlerExpand",
        # Stub for xml.sax
        "ProcessingInstructionHandler",
        "UnparsedEntityDeclHandler",
        "NotationDeclHandler",
        "ExternalEntityRefHandler",
        # Stub for xml.dom
        "StartDoctypeDeclHandler",
        "EntityDeclHandler",
        "CommentHandler",
        "StartCdataSectionHandler",
        "EndCdataSectionHandler",
        "XmlDeclHandler",
        "ElementDeclHandler",
        "AttlistDeclHandler",
        # Stub for Kid
        "DefaultHandler",
    ]

    returns_unicode = True
    intern = {}

    def __init__(self, separator):
        """Create a parser.

        separator -- namespace separator string, or None to report names
                     exactly as the reader's ``Name`` property gives them.
        """
        self._data = []
        self._separator = separator
        self._ns_stack = []
        self.ordered_attributes = False
        self.namespace_prefixes = False
        from System.Text import Encoding
        # Default encoding, used until the document declares another one.
        self._enc = Encoding.ASCII.WebName

    def Parse(self, data, isfinal=False):
        """Buffer ``data``; parse everything buffered once ``isfinal`` is true."""
        self._data.append(data)
        if isfinal:
            data = "".join(self._data)
            self._data = None
            self._parse(data)

    def _qname(self):
        """Return the name of the node the reader is positioned on.

        Without a separator this is the reader's ``Name``.  With a separator
        it is ``LocalName`` for nodes without a namespace, otherwise
        ``uri + sep + localname``, followed by ``sep + prefix`` when
        namespace_prefixes is set and the node has a prefix.  Every path
        returns a string.
        """
        separator = self._separator
        reader = self._reader
        if separator is None:
            return self._handle_unicodechar(reader.Name)
        if not reader.NamespaceURI:
            return self._handle_unicodechar(reader.LocalName)
        qualified = (reader.NamespaceURI + separator +
                     self._handle_unicodechar(reader.LocalName))
        if self.namespace_prefixes and reader.Prefix:
            return qualified + separator + reader.Prefix
        return qualified

    def _parse(self, data):
        """Run an XmlReader over ``data`` and dispatch every node it yields."""
        # Patch: allow documents that carry a DTD (ProhibitDtd = False).
        from System.Xml import XmlReaderSettings
        readersettings = XmlReaderSettings()
        readersettings.ProhibitDtd = False
        reader = XmlReader.Create(StringReader(data), readersettings)
        self._reader = reader
        try:
            while reader.Read():
                self._dispatch(Enum.GetName(XmlNodeType, reader.NodeType))
        finally:
            # Release the reader even when a handler or the parser raises.
            reader.Close()

    def _dispatch(self, typename):
        """Call the ``_handle_<typename>`` method for the current node, if any.

        A ``meta`` start tag is first inspected for a charset and is then
        handled like any other element, so that it still reaches the
        StartElementHandler / EndElementHandler callbacks.
        """
        if typename == "Element" and self._reader.Name.lower() == "meta":
            self._handle_meta()
        handler = getattr(self, '_handle_' + typename, None)
        if handler is not None:
            handler()

    def _handle_Element(self):
        """Report a start tag together with its namespace declarations and attributes."""
        reader = self._reader
        name = self._qname()
        ns_stack = self._ns_stack
        # None marks where this element's prefixes begin on the stack;
        # _handle_EndElement pops back down to it.
        ns_stack.append(None)
        if self.ordered_attributes:
            attributes = []
        else:
            attributes = {}
        while reader.MoveToNextAttribute():
            if reader.Prefix == 'xmlns':
                # xmlns:prefix="uri" is a declaration, not an attribute.
                prefix = self._handle_unicodechar(reader.LocalName)
                uri = reader.Value
                ns_stack.append(prefix)
                if hasattr(self, "StartNamespaceDeclHandler"):
                    self.StartNamespaceDeclHandler(prefix, uri)
                continue
            key = self._qname()
            value = self._handle_unicodechar(reader.Value)
            if self.ordered_attributes:
                # Flat [name, value, name, value, ...] list.
                attributes.append(key)
                attributes.append(value)
            else:
                attributes[key] = value
        # Step back from the last attribute onto the element itself.
        reader.MoveToElement()
        if hasattr(self, "StartElementHandler"):
            self.StartElementHandler(name, attributes)
        # EndElement node is not generated for empty elements.
        # Call its handler here.
        if reader.IsEmptyElement:
            self._handle_EndElement()

    def _handle_EndElement(self):
        """Report an end tag, then close the namespace prefixes it declared."""
        name = self._qname()
        if hasattr(self, "EndElementHandler"):
            self.EndElementHandler(name)
        ns_stack = self._ns_stack
        while True:
            prefix = ns_stack.pop()
            if prefix is None:
                break
            if hasattr(self, "EndNamespaceDeclHandler"):
                self.EndNamespaceDeclHandler(prefix)

    def _handle_Text(self):
        """Pass the current node's value to CharacterDataHandler."""
        data = self._handle_unicodechar(self._reader.Value)
        if hasattr(self, "CharacterDataHandler"):
            self.CharacterDataHandler(data)

    # The content of a CDATA section is character data too; without this
    # alias CDATA nodes have no handler and their text is dropped.
    _handle_CDATA = _handle_Text

    def _handle_XmlDeclaration(self):
        """Take the document encoding from the XML declaration, if it names one."""
        enc = self._reader.GetAttribute("encoding")
        # A missing attribute comes back as None rather than "".
        if not enc:
            return
        self._enc = enc

    def _handle_meta(self):
        """Take the encoding from the ``charset=`` part of a meta tag's content attribute."""
        content = self._reader.GetAttribute("content")
        # Missing (None) or empty content: nothing to look at.
        if not content:
            return
        content = content.lower()
        i = content.find("charset=")
        if i == -1:
            return
        # Everything after "charset=" up to the next ";".
        charset = content[i + len("charset="):].split(";")[0].strip()
        self._enc = charset

    def _handle_unicodechar(self, data):
        """Return ``data`` decoded with the current encoding, or unchanged.

        Conversion is best effort: when no encoding is set, or decoding
        fails for any reason, ``data`` is returned as it came in.
        """
        if not self._enc:
            return data
        try:
            return unicode(data, self._enc)
        except Exception:
            # Deliberately forgiving, but KeyboardInterrupt and SystemExit
            # are no longer swallowed as they were by a bare except.
            return data

    # Stub for xml.sax
    def SetBase(self, base):
        """Accept and ignore ``base``."""
        pass

    def SetParamEntityParsing(self, flag):
        """Ignore ``flag`` and return True."""
        return True

    # Stub for Kid
    def UseForeignDTD(self):
        """Do nothing."""
        pass

0 件のコメント: