Module parser
[hide private]
[frames | no frames]

Source Code for Module parser

  1   
  2  from configParser import C3Object 
  3  from baseObjects import Parser 
  4  from xml.dom.minidom import parseString as domParseString 
  5  from record import SaxRecord, MinidomRecord, FtDomRecord, SaxContentHandler, LxmlRecord 
  6   
  7  from xml.sax import ContentHandler, make_parser, parseString as saxParseString, ErrorHandler, InputSource as SaxInput 
  8  from xml.sax.saxutils import escape 
  9   
 10   
 11  from utils import flattenTexts, elementType 
 12  import re 
 13  import cStringIO, StringIO 
 14   
 15   
 16  # utility function to update data on record from document 
 17       
 18   
class BaseParser(Parser):
    """Shared base class for the concrete parsers in this module."""

    def copy_data(self, doc, rec):
        """Update the data on record ``rec`` from document ``doc``.

        Copies the filename and schema, passes the process history on
        (with this parser's id appended), and records where the record
        came from.  ``rec.parent`` is left untouched when the document
        has neither a ``documentStore`` nor a ``parent``.
        """
        rec.filename = doc.filename
        rec.schema = doc.schema

        # The record receives the document's own history list, not a copy,
        # so the append below is visible through ``doc`` as well.
        # NOTE(review): confirm that sharing the list is intended.
        rec.processHistory = doc.processHistory
        rec.processHistory.append(self.id)

        # Prefer a reference to the stored document; otherwise fall back
        # to whatever parent the document itself carries.
        store = doc.documentStore
        origin = ('document', store, doc.id) if store else doc.parent
        if origin:
            rec.parent = origin
class MinidomParser(BaseParser):
    """Parser backed by Python's default minidom implementation."""

    def process_document(self, session, doc):
        """Parse the raw data of ``doc`` with minidom.

        Returns a MinidomRecord built from the DOM and the raw data,
        with the document's details copied onto it via ``copy_data``.
        ``session`` is not used here.
        """
        raw = doc.get_raw()
        record = MinidomRecord(domParseString(raw), raw)
        self.copy_data(doc, record)
        return record
class SaxParser(BaseParser):
    """Default SAX based parser.  Creates SaxRecord.

    One SAX parser, input source and content handler are created in
    ``__init__`` and reused for every document; the content handler is
    re-initialised before and after each successful parse.
    """

    def __init__(self, session, parent, config):
        """Create the reusable SAX machinery and apply configured settings.

        Settings read via ``get_setting``:
          namespaces      -- if set, turn on the SAX namespaces feature.
          attrHash        -- whitespace separated ``left@right`` pairs; each
                             ``right`` is appended to the list stored under
                             ``left`` in the handler's ``hashAttributesNames``.
          stripWhitespace -- if set, set ``stripWS`` on the content handler.

        The three arguments are forwarded positionally, in this order, to
        ``Parser.__init__``.
        """
        Parser.__init__(self, session, parent, config)
        self.parser = make_parser()
        self.errorHandler = ErrorHandler()
        self.parser.setErrorHandler(self.errorHandler)
        self.inputSource = SaxInput()
        ch = SaxContentHandler()
        self.contentHandler = ch
        self.parser.setContentHandler(ch)
        # When true, a failed parse leaves the handler state in place and
        # records the failing location in self.errorPath.
        self.keepError = 1

        if self.get_setting(session, 'namespaces'):
            self.parser.setFeature('http://xml.org/sax/features/namespaces', 1)
        p = self.get_setting(session, 'attrHash')
        if p:
            for pair in p.split():
                (a, b) = pair.split("@")
                try:
                    ch.hashAttributesNames[a].append(b)
                except KeyError:
                    # First value seen for this key: start a new list.
                    # Only the missing-key case is handled; any other
                    # error is a real problem and must surface.
                    ch.hashAttributesNames[a] = [b]
        if self.get_setting(session, 'stripWhitespace'):
            ch.stripWS = 1

    def _error_path(self, ch):
        """Build the ``name[@SAXID='n']/...`` path for a failed parse.

        Walks ``ch.pathLines`` and extracts each name from the matching
        entry of ``ch.currentText``.  This is diagnostic only, so a
        malformed or missing entry ends the walk and the path resolved so
        far is returned, rather than raising and hiding the parse error
        that is being reported.
        """
        path = []
        for l in ch.pathLines:
            try:
                line = ch.currentText[l]
                elemName = line[2:line.index('{') - 1]
            except (IndexError, KeyError, TypeError, ValueError):
                break
            path.append("%s[@SAXID='%s']" % (elemName, l))
        return '/'.join(path)

    def process_document(self, session, doc):
        """Parse the raw data of ``doc`` with SAX and return a SaxRecord.

        If parsing fails the original exception is always re-raised.
        Before that, with ``keepError`` true, the content handler is left
        as it was at the point of failure and ``self.errorPath`` is set;
        with ``keepError`` false the content handler is reset instead.
        """
        xml = doc.get_raw()
        self.inputSource.setByteStream(cStringIO.StringIO(xml))
        ch = self.contentHandler
        ch.reinit()
        try:
            self.parser.parse(self.inputSource)
        except:
            # Deliberately catches everything: this clause only tidies up
            # and then re-raises unchanged.
            if self.keepError:
                self.errorPath = self._error_path(ch)
            else:
                ch.reinit()
            raise
        rec = SaxRecord(ch.currentText, xml, recordSize=ch.recordSize)
        rec.elementHash = ch.elementHash
        self.copy_data(doc, rec)
        ch.reinit()
        return rec
# lxml is optional.  Only the import is guarded, and only ImportError is
# treated as "lxml not available"; any other failure is allowed to surface.
try:
    from lxml import etree
except ImportError:
    # Define an empty class so the name still exists without lxml.
    class LxmlParser(Parser):
        pass
else:
    class LxmlParser(BaseParser):
        """ lxml based Parser.  Creates LxmlRecords """

        def process_document(self, session, doc):
            """Parse the raw data of ``doc`` with lxml; return an LxmlRecord."""
            data = doc.get_raw()
            et = etree.XML(data)
            rec = LxmlRecord(et)
            self.copy_data(doc, rec)
            return rec


# Empty placeholders.  They do not use lxml, so they are defined whether
# or not the import above succeeded and the module always exposes them.
class LxmlSchemaParser(Parser):
    pass


class LxmlRelaxNGParser(Parser):
    pass


from Ft.Xml import Sax, InputSource as FtInput
from Ft.Xml.Domlette import NonvalidatingReaderBase
class FtParser(BaseParser, NonvalidatingReaderBase):
    """4Suite based parser.  Creates FtDomRecords."""

    def __init__(self, session, config, parent):
        """Initialise both bases explicitly: Parser, then the 4Suite reader."""
        Parser.__init__(self, session, config, parent)
        NonvalidatingReaderBase.__init__(self)

    def process_document(self, session, doc):
        """Parse the raw data of ``doc`` and return an FtDomRecord.

        The data is parsed with this object's ``parseString`` using the
        fixed URI 'urn:foo'.  The record is built from the resulting DOM
        and the raw data, then updated from the document via
        ``copy_data``.  ``session`` is not used here.
        """
        raw = doc.get_raw()
        record = FtDomRecord(self.parseString(raw, 'urn:foo'), raw)
        self.copy_data(doc, record)
        return record
138 -class FtSaxParser(BaseParser):
139 """ 4Suite SAX based Parser. Creates SaxRecords """ 140
141 - def __init__(self, session, parent, config):
142 Parser.