Module documentFactory
[hide private]
[frames | no frames]

Source Code for Module documentFactory

   1   
   2  import socket, time 
   3  socket.setdefaulttimeout(30) 
   4   
   5  from baseObjects import DocumentFactory 
   6  from document import StringDocument 
   7  from record import SaxRecord 
   8  from bootstrap import BSParser 
   9  from utils import elementType, getFirstData, flattenTexts, reader, verifyXPaths 
  10  import re, os, c3errors, tarfile, cStringIO, sys, gzip 
  11  import mimetypes, httplib, urllib, urlparse, urllib2 
  12  import commands, codecs, types 
  13  from ZSI.client import Binding 
  14  from PyZ3950 import zoom 
  15  import SRW 
  16  from c3errors import * 
  17  from ftplib import FTP 
  18  from GoogleSearch_services import * 
  19  from utils import reader 
  20   
  21  mimetypes.add_type('application/marc', '.marc') 
  22   
  23  # NB: 
  24  # cache = 0:  yield, no caching 
  25  # cache = 1:  step through, cache positions in stream 
  26  # cache = 2:  step through, cache full documents 
  27  # other cache values undefined 
  28   
30 streamLocation = "" 31 format = "" 32 schema = "" 33 codec = "" 34 factory = None 35 filterRe = None 36 stream = None 37 locations = [] 38 documents = [] 39 length = 0 40
def __init__(self, session, stream, format, schema=None, codec=None, factory=None):
    """Store the stream settings on the instance and open the source.

    session -- accepted as part of the common signature; not used here
    stream  -- anything open_stream() understands
    format, schema, codec, factory -- kept unchanged as attributes
    """
    self.factory, self.format = factory, format
    self.schema, self.codec = schema, codec
    # open_stream() looks at self.codec, so the attributes above have to
    # be assigned before it is called.
    self.stream = self.open_stream(stream)
47
def open_stream(self, stream):
    """Turn *stream* into the object the document finders will read from.

    The argument is classified in this order:

    * an object having both ``read`` and ``seek`` is returned untouched;
    * a string naming an existing path: a regular file is opened (via
      ``codecs.open`` when ``self.codec`` is set), while a directory
      path is handed back as the plain string;
    * anything else is treated as literal document data and wrapped in
      a ``cStringIO.StringIO``.

    ``self.streamLocation`` is set to "UNKNOWN", the path, or "STRING"
    respectively.
    """
    if hasattr(stream, 'read') and hasattr(stream, 'seek'):
        # Already a seekable file-like object.
        self.streamLocation = "UNKNOWN"
        return stream

    if not os.path.exists(stream):
        # Not a path on disk, so the argument is the data itself.
        self.streamLocation = "STRING"
        return cStringIO.StringIO(stream)

    self.streamLocation = stream
    if os.path.isdir(stream):
        # A directory cannot be opened here; return the path as given.
        return stream
    if self.codec:
        return codecs.open(stream, 'r', self.codec)
    # open() is the documented way to obtain a file object; calling the
    # file type directly did the same job but only exists on Python 2.
    return open(stream)
68
def fetch_document(self, idx):
    """Return the cached item at position *idx*.

    When full documents are cached the stored document is returned.
    When only positions are cached, the (offset, size) entry is used to
    seek in self.stream and the raw data read from there is returned.

    Raises StopIteration when a length is recorded and *idx* is at or
    beyond it, or when nothing has been cached at all.
    """
    beyond_end = bool(self.length) and idx >= self.length
    if beyond_end or not (self.documents or self.locations):
        raise StopIteration
    if self.documents:
        return self.documents[idx]
    # Position cache: entry[0] is the offset, entry[1] the size to read.
    entry = self.locations[idx]
    self.stream.seek(entry[0])
    return self.stream.read(entry[1])
80 81
class TermHashDocumentStream(BaseDocumentStream):
    """Document stream whose source is a hash; each key becomes a document."""

    def open_stream(self, stream):
        """Return the keys of the hash *stream* as the items to step through."""
        self.streamLocation = "TERM-STRING"
        return stream.keys()

    def find_documents(self, session, cache=0):
        """Generate, or cache, one StringDocument per key.

        cache == 0 -- yield each document in turn; nothing is stored
        cache == 2 -- store every document in self.documents and record
                      the count in self.length
        Any other value does nothing.

        This is a generator function, so the caching mode also only
        runs once the returned generator is iterated.
        """
        if cache == 0:
            for term in self.stream:
                yield StringDocument(term)
            # End with a plain return: an explicit StopIteration raised
            # inside a generator becomes RuntimeError where PEP 479 applies.
            return
        if cache == 2:
            self.documents = [StringDocument(term) for term in self.stream]
            # Record the count so fetch_document() can bounds-check and
            # signal the end with StopIteration rather than IndexError.
            self.length = len(self.documents)
100 101
class XmlDocumentStream(BaseDocumentStream):
    """Split a stream of concatenated XML into one document per element."""

    # Compiled pattern matching the start tag that opens a document.
    start = None
    # Literal end tag closing a document; "" means derive it per document
    # from whichever start tag was matched.
    endtag = ""

    def __init__(self, session, stream, format, schema="", codec="", factory=None):
        """Open the stream and prepare the start/end tag matchers.

        With a *schema*, documents are the elements named exactly
        *schema*.  Without one, any start tag (optionally prefixed with
        a namespace) opens a document and the end tag is worked out
        from that match.
        """
        BaseDocumentStream.__init__(self, session, stream, format, schema, codec, factory)
        if not schema:
            self.start = re.compile(r"<([-a-zA-Z0-9_.]+:)?([-a-zA-Z0-9_.]+)[\s>]")
            self.endtag = ""
        else:
            self.start = re.compile(r"<%s[\s>]" % schema)
            self.endtag = "</" + schema + ">"

    def find_documents(self, session, cache=0):
        """Scan the stream in 1024-unit reads and emit each document.

        cache == 0 -- yield a StringDocument per document; the stream is
                      closed when the input is exhausted
        cache == 1 -- record (offset, size) pairs in self.locations
        cache == 2 -- record StringDocuments in self.documents
        Other values scan the input without recording anything.

        Any <?xml ...?> declaration in the input is not copied into the
        documents (the previous code located it and then blanked it, so
        documents have always been emitted without it).

        If the input ends before the end tag of a started document has
        been seen, that partial document is dropped and scanning stops.

        This is a generator function, so the caching modes also only
        run once the returned generator is iterated.
        """
        docs = []
        locs = []
        endtag = self.endtag
        taglen = len(endtag)
        offset = 0          # position in the stream of buf[0]
        buf = ""
        while True:
            before = len(buf)
            buf += self.stream.read(1024)
            m = self.start.search(buf)
            if m:
                if not self.endtag:
                    # Strip the leading "<" and the trailing delimiter
                    # from the match to get the element name.
                    endtag = "</%s>" % m.group()[1:-1]
                    taglen = len(endtag)
                # Throw away whatever precedes the start tag.
                buf = buf[m.start():]
                offset += m.start()
                doc_start = offset
                searched = 0
                end = -1
                while end == -1:
                    # Step back by the tag length so an end tag split
                    # across two reads is still found; clamp at zero so
                    # a short buffer never turns this into a
                    # from-the-tail search.
                    end = buf.find(endtag, max(searched - taglen, 0))
                    if end > 0:
                        size = end + taglen
                        text = buf[:size]
                        buf = buf[size:]
                        offset += size
                        if cache == 0:
                            yield StringDocument(text, mimeType="text/xml", schema=self.schema)
                        elif cache == 1:
                            locs.append((doc_start, size))
                        elif cache == 2:
                            docs.append(StringDocument(text, mimeType="text/xml", schema=self.schema))
                    else:
                        searched = len(buf)
                        more = self.stream.read(1024)
                        if not more:
                            # Input exhausted mid-document; without this
                            # check the loop would never terminate.
                            break
                        buf += more
                if end == -1:
                    break
            elif len(buf) == before:
                # Nothing new was read and no start tag is pending.
                break

        if cache == 0:
            self.stream.close()
            # Plain return: raising StopIteration inside a generator
            # becomes RuntimeError where PEP 479 applies.
            return
        if cache != 1:
            # With cached positions, fetch_document() seeks on
            # self.stream, so the stream has to stay open in that mode.
            self.stream.close()
        self.locations = locs
        self.documents = docs
        self.length = max(len(locs), len(docs))
173 174
class MarcDocumentStream(BaseDocumentStream):
    """Split a stream of binary MARC records into one document each."""

    def find_documents(self, session, cache=0):
        """Scan the stream in 1536-unit reads and emit each record.

        A record is everything up to and including the next 0x1D
        character (the ISO 2709 / MARC record terminator).

        cache == 0 -- yield a StringDocument per record
        cache == 1 -- record (offset, size) pairs in self.locations
        cache == 2 -- record StringDocuments in self.documents

        Data left after the last terminator is discarded.  Once the
        input is exhausted self.locations, self.documents and
        self.length are (re)assigned in every mode.

        This is a generator function, so the caching modes also only
        run once the returned generator is iterated.
        """
        docs = []
        locs = []
        pending = self.stream.read(1536)
        offset = 0          # position in the stream of pending[0]
        while pending:
            end = pending.find("\x1D")
            while end > -1:
                record = pending[:end + 1]
                size = len(record)
                if cache == 0:
                    yield StringDocument(record, mimeType="application/marc")
                elif cache == 1:
                    locs.append((offset, size))
                elif cache == 2:
                    docs.append(StringDocument(record, mimeType="application/marc"))
                pending = pending[end + 1:]
                offset += size
                end = pending.find("\x1D")
            more = self.stream.read(1536)
            if not more:
                # End of input; anything still pending is junk with no
                # terminator and is dropped.
                break
            pending += more

        if cache != 1:
            # With cached positions, fetch_document() seeks on
            # self.stream, so the stream has to stay open in that mode.
            self.stream.close()
        self.locations = locs
        self.documents = docs
        self.length = max(len(locs), len(docs))
205 206 # XmlTapeDocStream 207 # ArcFileDocStream 208 # MetsDocStream 209 210
211 -class MultipleDocumentStream(BaseDocumentStream):
212
213 - def __init__(self, session, stream, format, schema=None, codec=None, factory=None ):