You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

gzip.py 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359
  1. """Functions that read and write gzipped files.
  2. The user of the file doesn't have to worry about the compression,
  3. but random access is not allowed."""
  4. # based on Andrew Kuchling's minigzip.py distributed with the zlib module
  5. import struct, sys, time
  6. import zlib
  7. import __builtin__
  8. __all__ = ["GzipFile","open"]
  9. FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
  10. READ, WRITE = 1, 2
  11. def write32(output, value):
  12. output.write(struct.pack("<l", value))
  13. def write32u(output, value):
  14. if value < 0:
  15. value = value + 0x100000000L
  16. output.write(struct.pack("<L", value))
  17. def read32(input):
  18. return struct.unpack("<l", input.read(4))[0]
  19. def open(filename, mode="rb", compresslevel=9):
  20. return GzipFile(filename, mode, compresslevel)
  21. class GzipFile:
  22. myfileobj = None
  23. def __init__(self, filename=None, mode=None,
  24. compresslevel=9, fileobj=None):
  25. if fileobj is None:
  26. fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
  27. if filename is None:
  28. if hasattr(fileobj, 'name'): filename = fileobj.name
  29. else: filename = ''
  30. if mode is None:
  31. if hasattr(fileobj, 'mode'): mode = fileobj.mode
  32. else: mode = 'rb'
  33. if mode[0:1] == 'r':
  34. self.mode = READ
  35. # Set flag indicating start of a new member
  36. self._new_member = 1
  37. self.extrabuf = ""
  38. self.extrasize = 0
  39. self.filename = filename
  40. elif mode[0:1] == 'w' or mode[0:1] == 'a':
  41. self.mode = WRITE
  42. self._init_write(filename)
  43. self.compress = zlib.compressobj(compresslevel,
  44. zlib.DEFLATED,
  45. -zlib.MAX_WBITS,
  46. zlib.DEF_MEM_LEVEL,
  47. 0)
  48. else:
  49. raise ValueError, "Mode " + mode + " not supported"
  50. self.fileobj = fileobj
  51. if self.mode == WRITE:
  52. self._write_gzip_header()
  53. def __repr__(self):
  54. s = repr(self.fileobj)
  55. return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
  56. def _init_write(self, filename):
  57. if filename[-3:] != '.gz':
  58. filename = filename + '.gz'
  59. self.filename = filename
  60. self.crc = zlib.crc32("")
  61. self.size = 0
  62. self.writebuf = []
  63. self.bufsize = 0
  64. def _write_gzip_header(self):
  65. self.fileobj.write('\037\213') # magic header
  66. self.fileobj.write('\010') # compression method
  67. fname = self.filename[:-3]
  68. flags = 0
  69. if fname:
  70. flags = FNAME
  71. self.fileobj.write(chr(flags))
  72. write32u(self.fileobj, long(time.time()))
  73. self.fileobj.write('\002')
  74. self.fileobj.write('\377')
  75. if fname:
  76. self.fileobj.write(fname + '\000')
  77. def _init_read(self):
  78. self.crc = zlib.crc32("")
  79. self.size = 0
  80. def _read_gzip_header(self):
  81. magic = self.fileobj.read(2)
  82. if magic != '\037\213':
  83. raise IOError, 'Not a gzipped file'
  84. method = ord( self.fileobj.read(1) )
  85. if method != 8:
  86. raise IOError, 'Unknown compression method'
  87. flag = ord( self.fileobj.read(1) )
  88. # modtime = self.fileobj.read(4)
  89. # extraflag = self.fileobj.read(1)
  90. # os = self.fileobj.read(1)
  91. self.fileobj.read(6)
  92. if flag & FEXTRA:
  93. # Read & discard the extra field, if present
  94. xlen=ord(self.fileobj.read(1))
  95. xlen=xlen+256*ord(self.fileobj.read(1))
  96. self.fileobj.read(xlen)
  97. if flag & FNAME:
  98. # Read and discard a null-terminated string containing the filename
  99. while (1):
  100. s=self.fileobj.read(1)
  101. if not s or s=='\000': break
  102. if flag & FCOMMENT:
  103. # Read and discard a null-terminated string containing a comment
  104. while (1):
  105. s=self.fileobj.read(1)
  106. if not s or s=='\000': break
  107. if flag & FHCRC:
  108. self.fileobj.read(2) # Read & discard the 16-bit header CRC
  109. def write(self,data):
  110. if self.fileobj is None:
  111. raise ValueError, "write() on closed GzipFile object"
  112. if len(data) > 0:
  113. self.size = self.size + len(data)
  114. self.crc = zlib.crc32(data, self.crc)
  115. self.fileobj.write( self.compress.compress(data) )
  116. def writelines(self,lines):
  117. self.write(" ".join(lines))
  118. def read(self, size=-1):
  119. if self.extrasize <= 0 and self.fileobj is None:
  120. return ''
  121. readsize = 1024
  122. if size < 0: # get the whole thing
  123. try:
  124. while 1:
  125. self._read(readsize)
  126. readsize = readsize * 2
  127. except EOFError:
  128. size = self.extrasize
  129. else: # just get some more of it
  130. try:
  131. while size > self.extrasize:
  132. self._read(readsize)
  133. readsize = readsize * 2
  134. except EOFError:
  135. if size > self.extrasize:
  136. size = self.extrasize
  137. chunk = self.extrabuf[:size]
  138. self.extrabuf = self.extrabuf[size:]
  139. self.extrasize = self.extrasize - size
  140. return chunk
  141. def _unread(self, buf):
  142. self.extrabuf = buf + self.extrabuf
  143. self.extrasize = len(buf) + self.extrasize
  144. def _read(self, size=1024):
  145. if self.fileobj is None: raise EOFError, "Reached EOF"
  146. if self._new_member:
  147. # If the _new_member flag is set, we have to
  148. # jump to the next member, if there is one.
  149. #
  150. # First, check if we're at the end of the file;
  151. # if so, it's time to stop; no more members to read.
  152. pos = self.fileobj.tell() # Save current position
  153. self.fileobj.seek(0, 2) # Seek to end of file
  154. if pos == self.fileobj.tell():
  155. self.fileobj = None
  156. raise EOFError, "Reached EOF"
  157. else:
  158. self.fileobj.seek( pos ) # Return to original position
  159. self._init_read()
  160. self._read_gzip_header()
  161. self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
  162. self._new_member = 0
  163. # Read a chunk of data from the file
  164. buf = self.fileobj.read(size)
  165. # If the EOF has been reached, flush the decompression object
  166. # and mark this object as finished.
  167. if buf == "":
  168. uncompress = self.decompress.flush()
  169. self._read_eof()
  170. self.fileobj = None
  171. self._add_read_data( uncompress )
  172. raise EOFError, 'Reached EOF'
  173. uncompress = self.decompress.decompress(buf)
  174. self._add_read_data( uncompress )
  175. if self.decompress.unused_data != "":
  176. # Ending case: we've come to the end of a member in the file,
  177. # so seek back to the start of the unused data, finish up
  178. # this member, and read a new gzip header.
  179. # (The number of bytes to seek back is the length of the unused
  180. # data, minus 8 because _read_eof() will rewind a further 8 bytes)
  181. self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
  182. # Check the CRC and file size, and set the flag so we read
  183. # a new member on the next call
  184. self._read_eof()
  185. self._new_member = 1
  186. def _add_read_data(self, data):
  187. self.crc = zlib.crc32(data, self.crc)
  188. self.extrabuf = self.extrabuf + data
  189. self.extrasize = self.extrasize + len(data)
  190. self.size = self.size + len(data)
  191. def _read_eof(self):
  192. # We've read to the end of the file, so we have to rewind in order
  193. # to reread the 8 bytes containing the CRC and the file size.
  194. # We check the that the computed CRC and size of the
  195. # uncompressed data matches the stored values.
  196. self.fileobj.seek(-8, 1)
  197. crc32 = read32(self.fileobj)
  198. isize = read32(self.fileobj)
  199. if crc32%0x100000000L != self.crc%0x100000000L:
  200. raise ValueError, "CRC check failed"
  201. elif isize != self.size:
  202. raise ValueError, "Incorrect length of data produced"
  203. def close(self):
  204. if self.mode == WRITE:
  205. self.fileobj.write(self.compress.flush())
  206. write32(self.fileobj, self.crc)
  207. write32(self.fileobj, self.size)
  208. self.fileobj = None
  209. elif self.mode == READ:
  210. self.fileobj = None
  211. if self.myfileobj:
  212. self.myfileobj.close()
  213. self.myfileobj = None
  214. def __del__(self):
  215. try:
  216. if (self.myfileobj is None and
  217. self.fileobj is None):
  218. return
  219. except AttributeError:
  220. return
  221. self.close()
  222. def flush(self):
  223. self.fileobj.flush()
  224. def isatty(self):
  225. return 0
  226. def readline(self, size=-1):
  227. if size < 0: size = sys.maxint
  228. bufs = []
  229. orig_size = size
  230. readsize = min(100, size) # Read from the file in small chunks
  231. while 1:
  232. if size == 0:
  233. return "".join(bufs) # Return resulting line
  234. c = self.read(readsize)
  235. i = c.find('\n')
  236. if size is not None:
  237. # We set i=size to break out of the loop under two
  238. # conditions: 1) there's no newline, and the chunk is
  239. # larger than size, or 2) there is a newline, but the
  240. # resulting line would be longer than 'size'.
  241. if i==-1 and len(c) > size: i=size-1
  242. elif size <= i: i = size -1
  243. if i >= 0 or c == '':
  244. bufs.append(c[:i+1]) # Add portion of last chunk
  245. self._unread(c[i+1:]) # Push back rest of chunk
  246. return ''.join(bufs) # Return resulting line
  247. # Append chunk to list, decrease 'size',
  248. bufs.append(c)
  249. size = size - len(c)
  250. readsize = min(size, readsize * 2)
  251. def readlines(self, sizehint=0):
  252. # Negative numbers result in reading all the lines
  253. if sizehint <= 0: sizehint = sys.maxint
  254. L = []
  255. while sizehint > 0:
  256. line = self.readline()
  257. if line == "": break
  258. L.append( line )
  259. sizehint = sizehint - len(line)
  260. return L
  261. def writelines(self, L):
  262. for line in L:
  263. self.write(line)
  264. def _test():
  265. # Act like gzip; with -d, act like gunzip.
  266. # The input file is not deleted, however, nor are any other gzip
  267. # options or features supported.
  268. import sys
  269. args = sys.argv[1:]
  270. decompress = args and args[0] == "-d"
  271. if decompress:
  272. args = args[1:]
  273. if not args:
  274. args = ["-"]
  275. for arg in args:
  276. if decompress:
  277. if arg == "-":
  278. f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
  279. g = sys.stdout
  280. else:
  281. if arg[-3:] != ".gz":
  282. print "filename doesn't end in .gz:", `arg`
  283. continue
  284. f = open(arg, "rb")
  285. g = __builtin__.open(arg[:-3], "wb")
  286. else:
  287. if arg == "-":
  288. f = sys.stdin
  289. g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
  290. else:
  291. f = __builtin__.open(arg, "rb")
  292. g = open(arg + ".gz", "wb")
  293. while 1:
  294. chunk = f.read(1024)
  295. if not chunk:
  296. break
  297. g.write(chunk)
  298. if g is not sys.stdout:
  299. g.close()
  300. if f is not sys.stdin:
  301. f.close()
  302. if __name__ == '__main__':
  303. _test()