diff options
Diffstat (limited to 'lib/jython/Lib/codecs.py')
-rw-r--r-- | lib/jython/Lib/codecs.py | 570 |
1 files changed, 570 insertions, 0 deletions
diff --git a/lib/jython/Lib/codecs.py b/lib/jython/Lib/codecs.py new file mode 100644 index 000000000..e04e9dd03 --- /dev/null +++ b/lib/jython/Lib/codecs.py @@ -0,0 +1,570 @@ +""" codecs -- Python Codec Registry, API and helpers.
+
+
+Written by Marc-Andre Lemburg (mal@lemburg.com).
+
+(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
+
+"""#"
+
+import struct,types,__builtin__
+
+### Registry and builtin stateless codec functions
+
+# Pull the registry functions and the builtin stateless codec functions
+# from the _codecs module into this namespace; this module cannot work
+# without them, so a failed import is reported as a SystemError.
+try:
+ from _codecs import *
+except ImportError,why:
+ raise SystemError,\
+ 'Failed to load the builtin codecs: %s' % why
+
+# Names exported by "from codecs import *".
+__all__ = ["register","lookup","open","EncodedFile","BOM","BOM_BE",
+ "BOM_LE","BOM32_BE","BOM32_LE","BOM64_BE","BOM64_LE"]
+
+### Constants
+
+#
+# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
+#
+# BOM is U+FEFF packed as an unsigned 16-bit value in the byte order
+# of the running platform ('=' selects native byte order with standard
+# sizes), so it equals either BOM_BE or BOM_LE.
+BOM = struct.pack('=H',0xFEFF)
+#
+# The two bytes FE FF: U+FEFF (ZERO WIDTH NO-BREAK SPACE) stored
+# most significant byte first.
+BOM_BE = BOM32_BE = '\376\377'
+# The two bytes FF FE: U+FEFF stored least significant byte first.
+# Read in the opposite (big endian) order the same bytes give U+FFFE,
+# which is defined as being an illegal Unicode character.
+BOM_LE = BOM32_LE = '\377\376'
+
+#
+# Four-byte Byte Order Marks
+#
+# NOTE: despite the "32" and "64" in the names, the literals in this
+# section and the one above are 2 and 4 bytes long respectively.
+#
+# The four bytes 00 00 FE FF: the value 0x0000FEFF stored most
+# significant byte first.
+BOM64_BE = '\000\000\376\377'
+# The four bytes FF FE 00 00: the value 0x0000FEFF stored least
+# significant byte first.
+BOM64_LE = '\377\376\000\000'
+
+
+### Codec base classes (defining the API)
+
+class Codec:
+
+ """ Defines the interface for stateless encoders/decoders.
+
+ The .encode()/.decode() methods may implement different error
+ handling schemes by providing the errors argument. These
+ string values are defined:
+
+ 'strict' - raise a ValueError error (or a subclass)
+ 'ignore' - ignore the character and continue with the next
+ 'replace' - replace with a suitable replacement character;
+ Python will use the official U+FFFD REPLACEMENT
+ CHARACTER for the builtin Unicode codecs.
+
+ """
+ def encode(self,input,errors='strict'):
+
+ """ Encodes the object input and returns a tuple (output
+ object, length consumed).
+
+ errors defines the error handling to apply. It defaults to
+ 'strict' handling.
+
+ The method may not store state in the Codec instance. Use
+ StreamCodec for codecs which have to keep state in order to
+ make encoding/decoding efficient.
+
+ The encoder must be able to handle zero length input and
+ return an empty object of the output object type in this
+ situation.
+
+ """
+ raise NotImplementedError
+
+ def decode(self,input,errors='strict'):
+
+ """ Decodes the object input and returns a tuple (output
+ object, length consumed).
+
+ input must be an object which provides the bf_getreadbuf
+ buffer slot. Python strings, buffer objects and memory
+ mapped files are examples of objects providing this slot.
+
+ errors defines the error handling to apply. It defaults to
+ 'strict' handling.
+
+ The method may not store state in the Codec instance. Use
+ StreamCodec for codecs which have to keep state in order to
+ make encoding/decoding efficient.
+
+ The decoder must be able to handle zero length input and
+ return an empty object of the output object type in this
+ situation.
+
+ """
+ raise NotImplementedError
+
+#
+# The StreamWriter and StreamReader classes provide generic working
+# interfaces which can be used to implement new encoding submodules
+# very easily. See encodings/utf_8.py for an example of how this is
+# done.
+#
+
+class StreamWriter(Codec):
+
+ def __init__(self,stream,errors='strict'):
+
+ """ Creates a StreamWriter instance.
+
+ stream must be a file-like object open for writing
+ (binary) data.
+
+ The StreamWriter may implement different error handling
+ schemes by providing the errors keyword argument. These
+ parameters are defined:
+
+ 'strict' - raise a ValueError (or a subclass)
+ 'ignore' - ignore the character and continue with the next
+ 'replace'- replace with a suitable replacement character
+
+ """
+ self.stream = stream
+ self.errors = errors
+
+ def write(self, object):
+
+ """ Writes the object's contents encoded to self.stream.
+ """
+ data, consumed = self.encode(object,self.errors)
+ self.stream.write(data)
+
+ def writelines(self, list):
+
+ """ Writes the concatenated list of strings to the stream
+ using .write().
+ """
+ self.write(''.join(list))
+
+ def reset(self):
+
+ """ Flushes and resets the codec buffers used for keeping state.
+
+ Calling this method should ensure that the data on the
+ output is put into a clean state, that allows appending
+ of new fresh data without having to rescan the whole
+ stream to recover state.
+
+ """
+ pass
+
+ def __getattr__(self,name,
+
+ getattr=getattr):
+
+ """ Inherit all other methods from the underlying stream.
+ """
+ return getattr(self.stream,name)
+
+###
+
+class StreamReader(Codec):
+
+ def __init__(self,stream,errors='strict'):
+
+ """ Creates a StreamReader instance.
+
+ stream must be a file-like object open for reading
+ (binary) data.
+
+ The StreamReader may implement different error handling
+ schemes by providing the errors keyword argument. These
+ parameters are defined:
+
+ 'strict' - raise a ValueError (or a subclass)
+ 'ignore' - ignore the character and continue with the next
+ 'replace'- replace with a suitable replacement character;
+
+ """
+ self.stream = stream
+ self.errors = errors
+
+ def read(self, size=-1):
+
+ """ Decodes data from the stream self.stream and returns the
+ resulting object.
+
+ size indicates the approximate maximum number of bytes to
+ read from the stream for decoding purposes. The decoder
+ can modify this setting as appropriate. The default value
+ -1 indicates to read and decode as much as possible. size
+ is intended to prevent having to decode huge files in one
+ step.
+
+ The method should use a greedy read strategy meaning that
+ it should read as much data as is allowed within the
+ definition of the encoding and the given size, e.g. if
+ optional encoding endings or state markers are available
+ on the stream, these should be read too.
+
+ """
+ # Unsliced reading:
+ if size < 0:
+ return self.decode(self.stream.read(), self.errors)[0]
+
+ # Sliced reading:
+ read = self.stream.read
+ decode = self.decode
+ data = read(size)
+ i = 0
+ while 1:
+ try:
+ object, decodedbytes = decode(data, self.errors)
+ except ValueError,why:
+ # This method is slow but should work under pretty much
+ # all conditions; at most 10 tries are made
+ i = i + 1
+ newdata = read(1)
+ if not newdata or i > 10:
+ raise
+ data = data + newdata
+ else:
+ return object
+
+ def readline(self, size=None):
+
+ """ Read one line from the input stream and return the
+ decoded data.
+
+ Note: Unlike the .readlines() method, this method inherits
+ the line breaking knowledge from the underlying stream's
+ .readline() method -- there is currently no support for
+ line breaking using the codec decoder due to lack of line
+ buffering. Sublcasses should however, if possible, try to
+ implement this method using their own knowledge of line
+ breaking.
+
+ size, if given, is passed as size argument to the stream's
+ .readline() method.
+
+ """
+ if size is None:
+ line = self.stream.readline()
+ else:
+ line = self.stream.readline(size)
+ return self.decode(line,self.errors)[0]
+
+
+ def readlines(self, sizehint=0):
+
+ """ Read all lines available on the input stream
+ and return them as list of lines.
+
+ Line breaks are implemented using the codec's decoder
+ method and are included in the list entries.
+
+ sizehint, if given, is passed as size argument to the
+ stream's .read() method.
+
+ """
+ if sizehint is None:
+ data = self.stream.read()
+ else:
+ data = self.stream.read(sizehint)
+ return self.decode(data,self.errors)[0].splitlines(1)
+
+ def reset(self):
+
+ """ Resets the codec buffers used for keeping state.
+
+ Note that no stream repositioning should take place.
+ This method is primarily intended to be able to recover
+ from decoding errors.
+
+ """
+ pass
+
+ def __getattr__(self,name,
+
+ getattr=getattr):
+
+ """ Inherit all other methods from the underlying stream.
+ """
+ return getattr(self.stream,name)
+
+###
+
+class StreamReaderWriter:
+
+ """ StreamReaderWriter instances allow wrapping streams which
+ work in both read and write modes.
+
+ The design is such that one can use the factory functions
+ returned by the codec.lookup() function to construct the
+ instance.
+
+ """
+ # Optional attributes set by the file wrappers below
+ encoding = 'unknown'
+
+ def __init__(self,stream,Reader,Writer,errors='strict'):
+
+ """ Creates a StreamReaderWriter instance.
+
+ stream must be a Stream-like object.
+
+ Reader, Writer must be factory functions or classes
+ providing the StreamReader, StreamWriter interface resp.
+
+ Error handling is done in the same way as defined for the
+ StreamWriter/Readers.
+
+ """
+ self.stream = stream
+ self.reader = Reader(stream, errors)
+ self.writer = Writer(stream, errors)
+ self.errors = errors
+
+ def read(self,size=-1):
+
+ return self.reader.read(size)
+
+ def readline(self, size=None):
+
+ return self.reader.readline(size)
+
+ def readlines(self, sizehint=None):
+
+ return self.reader.readlines(sizehint)
+
+ def write(self,data):
+
+ return self.writer.write(data)
+
+ def writelines(self,list):
+
+ return self.writer.writelines(list)
+
+ def reset(self):
+
+ self.reader.reset()
+ self.writer.reset()
+
+ def __getattr__(self,name,
+
+ getattr=getattr):
+
+ """ Inherit all other methods from the underlying stream.
+ """
+ return getattr(self.stream,name)
+
+###
+
+class StreamRecoder:
+
+ """ StreamRecoder instances provide a frontend - backend
+ view of encoding data.
+
+ They use the complete set of APIs returned by the
+ codecs.lookup() function to implement their task.
+
+ Data written to the stream is first decoded into an
+ intermediate format (which is dependent on the given codec
+ combination) and then written to the stream using an instance
+ of the provided Writer class.
+
+ In the other direction, data is read from the stream using a
+ Reader instance and then return encoded data to the caller.
+
+ """
+ # Optional attributes set by the file wrappers below
+ data_encoding = 'unknown'
+ file_encoding = 'unknown'
+
+ def __init__(self,stream,encode,decode,Reader,Writer,errors='strict'):
+
+ """ Creates a StreamRecoder instance which implements a two-way
+ conversion: encode and decode work on the frontend (the
+ input to .read() and output of .write()) while
+ Reader and Writer work on the backend (reading and
+ writing to the stream).
+
+ You can use these objects to do transparent direct
+ recodings from e.g. latin-1 to utf-8 and back.
+
+ stream must be a file-like object.
+
+ encode, decode must adhere to the Codec interface, Reader,
+ Writer must be factory functions or classes providing the
+ StreamReader, StreamWriter interface resp.
+
+ encode and decode are needed for the frontend translation,
+ Reader and Writer for the backend translation. Unicode is
+ used as intermediate encoding.
+
+ Error handling is done in the same way as defined for the
+ StreamWriter/Readers.
+
+ """
+ self.stream = stream
+ self.encode = encode
+ self.decode = decode
+ self.reader = Reader(stream, errors)
+ self.writer = Writer(stream, errors)
+ self.errors = errors
+
+ def read(self,size=-1):
+
+ data = self.reader.read(size)
+ data, bytesencoded = self.encode(data, self.errors)
+ return data
+
+ def readline(self,size=None):
+
+ if size is None:
+ data = self.reader.readline()
+ else:
+ data = self.reader.readline(size)
+ data, bytesencoded = self.encode(data, self.errors)
+ return data
+
+ def readlines(self,sizehint=None):
+
+ if sizehint is None:
+ data = self.reader.read()
+ else:
+ data = self.reader.read(sizehint)
+ data, bytesencoded = self.encode(data, self.errors)
+ return data.splitlines(1)
+
+ def write(self,data):
+
+ data, bytesdecoded = self.decode(data, self.errors)
+ return self.writer.write(data)
+
+ def writelines(self,list):
+
+ data = ''.join(list)
+ data, bytesdecoded = self.decode(data, self.errors)
+ return self.writer.write(data)
+
+ def reset(self):
+
+ self.reader.reset()
+ self.writer.reset()
+
+ def __getattr__(self,name,
+
+ getattr=getattr):
+
+ """ Inherit all other methods from the underlying stream.
+ """
+ return getattr(self.stream,name)
+
+### Shortcuts
+
+def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):
+
+ """ Open an encoded file using the given mode and return
+ a wrapped version providing transparent encoding/decoding.
+
+ Note: The wrapped version will only accept the object format
+ defined by the codecs, i.e. Unicode objects for most builtin
+ codecs. Output is also codec dependent and will usually by
+ Unicode as well.
+
+ Files are always opened in binary mode, even if no binary mode
+ was specified. Thisis done to avoid data loss due to encodings
+ using 8-bit values. The default file mode is 'rb' meaning to
+ open the file in binary read mode.
+
+ encoding specifies the encoding which is to be used for the
+ the file.
+
+ errors may be given to define the error handling. It defaults
+ to 'strict' which causes ValueErrors to be raised in case an
+ encoding error occurs.
+
+ buffering has the same meaning as for the builtin open() API.
+ It defaults to line buffered.
+
+ The returned wrapped file object provides an extra attribute
+ .encoding which allows querying the used encoding. This
+ attribute is only available if an encoding was specified as
+ parameter.
+
+ """
+ if encoding is not None and \
+ 'b' not in mode:
+ # Force opening of the file in binary mode
+ mode = mode + 'b'
+ file = __builtin__.open(filename, mode, buffering)
+ if encoding is None:
+ return file
+ (e,d,sr,sw) = lookup(encoding)
+ srw = StreamReaderWriter(file, sr, sw, errors)
+ # Add attributes to simplify introspection
+ srw.encoding = encoding
+ return srw
+
+def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
+
+ """ Return a wrapped version of file which provides transparent
+ encoding translation.
+
+ Strings written to the wrapped file are interpreted according
+ to the given data_encoding and then written to the original
+ file as string using file_encoding. The intermediate encoding
+ will usually be Unicode but depends on the specified codecs.
+
+ Strings are read from the file using file_encoding and then
+ passed back to the caller as string using data_encoding.
+
+ If file_encoding is not given, it defaults to data_encoding.
+
+ errors may be given to define the error handling. It defaults
+ to 'strict' which causes ValueErrors to be raised in case an
+ encoding error occurs.
+
+ The returned wrapped file object provides two extra attributes
+ .data_encoding and .file_encoding which reflect the given
+ parameters of the same name. The attributes can be used for
+ introspection by Python programs.
+
+ """
+ if file_encoding is None:
+ file_encoding = data_encoding
+ encode, decode = lookup(data_encoding)[:2]
+ Reader, Writer = lookup(file_encoding)[2:]
+ sr = StreamRecoder(file,
+ encode,decode,Reader,Writer,
+ errors)
+ # Add attributes to simplify introspection
+ sr.data_encoding = data_encoding
+ sr.file_encoding = file_encoding
+ return sr
+
+### Helpers for charmap-based codecs
+
+def make_identity_dict(rng):
+
+ """ make_identity_dict(rng) -> dict
+
+ Return a dictionary where elements of the rng sequence are
+ mapped to themselves.
+
+ """
+ res = {}
+ for i in rng:
+ res[i]=i
+ return res
+
+### Tests
+
+# Small demo, only executed when the module is run as a script: it
+# replaces sys.stdout and sys.stdin with recoding wrappers.
+if __name__ == '__main__':
+
+ import sys
+
+ # Make stdout translate Latin-1 output into UTF-8 output: strings
+ # written are interpreted as Latin-1 and reach the real stdout
+ # encoded as UTF-8
+ sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
+
+ # Have stdin translate Latin-1 input into UTF-8 input: data from
+ # the real stdin is interpreted as Latin-1 and handed to the reader
+ # encoded as UTF-8
+ sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
|