/* * RandomAccessCharacterFile.java * * Copyright (C) 2008 Hideo at Yokohama * Copyright (C) 2008-2009 Erik Huelsmann * $Id$ * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * * As a special exception, the copyright holders of this library give you * permission to link this library with independent modules to produce an * executable, regardless of the license terms of these independent * modules, and to copy and distribute the resulting executable under * terms of your choice, provided that you also meet, for each linked * independent module, the terms and conditions of the license of that * module. An independent module is a module which is not derived from * or based on this library. If you modify this library, you may extend * this exception to your version of the library, but you are not * obligated to do so. If you do not wish to do so, delete this * exception statement from your version. */ package org.armedbear.lisp.util; import java.io.IOException; import java.io.PushbackInputStream; import java.io.OutputStream; import java.io.RandomAccessFile; import java.io.PushbackReader; import java.io.Reader; import java.io.StringReader; import java.io.Writer; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.channels.FileChannel; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; import java.nio.charset.CoderResult; import java.nio.charset.CodingErrorAction; import java.nio.charset.UnsupportedCharsetException; import org.armedbear.lisp.Debug; import static org.armedbear.lisp.Lisp.error; import org.armedbear.lisp.SimpleError; import org.armedbear.lisp.SimpleString; public class RandomAccessCharacterFile { private class RandomAccessInputStream extends PushbackInputStream { public RandomAccessInputStream() { super(null); } private byte[] read_buf = new byte[1]; @Override public final int read() throws IOException { int len = read(read_buf); if (len == 1) { // byte is signed, char is unsigned, int is signed. // buf can hold 0xff, we want it as 0xff in int, not -1. return 0xff & (int) read_buf[0]; } else { return -1; } // ### BUG: 'int read()' is to return a *codepoint*, // not the half of a surrogate pair! } @Override public final int read(byte[] b, int off, int len) throws IOException { return RandomAccessCharacterFile.this.read(b, off, len); } @Override public final void unread(int b) throws IOException { RandomAccessCharacterFile.this.unreadByte((byte)b); } @Override public final void unread(byte[] b, int off, int len) throws IOException { for (int i = 0; i < len; i++) this.unread(b[off+i]); } @Override public final void unread(byte[] b) throws IOException { this.unread(b, 0, b.length); } @Override public final int available() throws IOException { return (int)(RandomAccessCharacterFile.this.length() - RandomAccessCharacterFile.this.position()); } @Override public final synchronized void mark(int readlimit) { } @Override public final boolean markSupported() { return false; } @Override public final synchronized void reset() throws IOException { throw new IOException("Operation not supported"); } @Override public final long skip(long n) throws IOException { RandomAccessCharacterFile.this.position(RandomAccessCharacterFile.this.position()+n); return n; } @Override public final int read(byte[] b) throws IOException { return this.read(b, 0, b.length); } @Override public final void close() throws IOException { RandomAccessCharacterFile.this.close(); } } private class RandomAccessOutputStream extends OutputStream { RandomAccessOutputStream() { } private byte[] buf = new byte[1]; public final void write(int b) throws IOException { buf[0] = (byte)b; RandomAccessCharacterFile.this.write(buf, 0, 1); } @Override public final void write(byte[] b) throws IOException { RandomAccessCharacterFile.this.write(b, 0, b.length); } @Override public final void write(byte[] b, int off, int len) throws IOException { RandomAccessCharacterFile.this.write(b, off, len); } @Override public final void flush() throws IOException { RandomAccessCharacterFile.this.flush(); } @Override public final void close() throws IOException { RandomAccessCharacterFile.this.close(); } } // dummy reader which we need to call the Pushback constructor // because a null value won't work static Reader staticReader = new StringReader(""); private class RandomAccessReader extends PushbackReader { RandomAccessReader() { // because we override all methods of Pushbackreader, // staticReader will never be referenced super(staticReader); } @Override public final void close() throws IOException { RandomAccessCharacterFile.this.close(); } private char[] read_buf = new char[1]; @Override public final int read() throws IOException { int n = this.read(read_buf); if (n == 1) return read_buf[0]; else return -1; // ### BUG: 'int read()' is to return a codepoint! // not the half of a surrogate pair! } @Override public final void unread(int c) throws IOException { RandomAccessCharacterFile.this.unreadChar((char)c); } @Override public final void unread(char[] cbuf, int off, int len) throws IOException { for (int i = 0; i < len; i++) this.unread(cbuf[off+i]); } @Override public final void unread(char[] cbuf) throws IOException { this.unread(cbuf, 0, cbuf.length); } @Override public final int read(CharBuffer target) throws IOException { //FIXME: to be implemented throw new IOException("Not implemented"); } @Override public final int read(char[] cbuf) throws IOException { return RandomAccessCharacterFile.this.read(cbuf, 0, cbuf.length); } @Override public final int read(char[] cb, int off, int len) throws IOException { return RandomAccessCharacterFile.this.read(cb, off, len); } @Override public final boolean ready() throws IOException { return true; } } private class RandomAccessWriter extends Writer { RandomAccessWriter() { } public final void close() throws IOException { RandomAccessCharacterFile.this.close(); } public final void flush() throws IOException { RandomAccessCharacterFile.this.flush(); } @Override public final void write(char[] cb, int off, int len) throws IOException { RandomAccessCharacterFile.this.write(cb, off, len); } } final static int BUFSIZ = 4*1024; // setting this to a small value like 8 is helpful for testing. private RandomAccessWriter writer; private RandomAccessReader reader; private RandomAccessInputStream inputStream; private RandomAccessOutputStream outputStream; private FileChannel fcn; private Charset cset; private CharsetEncoder cenc; private CharsetDecoder cdec; /** * bbuf is treated as a cache of the file content. * If it points to somewhere in the middle of the file, it holds the copy of the file content, * even when you are writing a large chunk of data. If you write in the middle of a file, * bbuf first gets filled with contents of the data, and only after that any new data is * written on bbuf. * The exception is when you are appending data at the end of the file. */ private ByteBuffer bbuf; private boolean bbufIsDirty; /* whether bbuf holds data that must be written. */ private boolean bbufIsReadable; /* whether bbuf.remaining() contains readable content. */ private long bbufpos; /* where the beginning of bbuf is pointing in the file now. */ public RandomAccessCharacterFile(RandomAccessFile raf, String encoding) throws IOException { fcn = raf.getChannel(); setEncoding(encoding); bbuf = ByteBuffer.allocate(BUFSIZ); // there is no readable data available in the buffers. bbuf.flip(); // there is no write pending data in the buffers. bbufIsDirty = false; bbufIsReadable = true; bbufpos = fcn.position(); reader = new RandomAccessReader(); writer = new RandomAccessWriter(); inputStream = new RandomAccessInputStream(); outputStream = new RandomAccessOutputStream(); } public void setEncoding(String encoding) { if (encoding == null) { cset = Charset.defaultCharset(); } else { try { cset = Charset.forName(encoding); } catch (UnsupportedCharsetException e) { error(new SimpleError("Undefined encoding: " + encoding)); } } cdec = cset.newDecoder(); cdec.onMalformedInput(CodingErrorAction.REPLACE); cdec.onUnmappableCharacter(CodingErrorAction.REPLACE); cenc = cset.newEncoder(); } public Writer getWriter() { return writer; } public PushbackReader getReader() { return reader; } public PushbackInputStream getInputStream() { return inputStream; } public OutputStream getOutputStream() { return outputStream; } public final void close() throws IOException { internalFlush(true); fcn.close(); } public final void flush() throws IOException { internalFlush(false); } private final boolean ensureReadBbuf(boolean force) throws IOException { boolean bufReady = true; if ((bbuf.remaining() == 0) || force || ! bbufIsReadable) { // need to read from the file. if (bbufIsDirty) { flushBbuf(false); bbuf.clear(); bbufIsReadable = false; } else { int bbufEnd = bbufIsReadable ? bbuf.limit() : bbuf.position(); fcn.position(bbufpos + bbufEnd); bbufpos += bbuf.position(); if (bbufIsReadable) { bbuf.compact(); bbufIsReadable = false; } else //must discard the junk bytes after bbuf.position() bbuf.clear(); } bufReady = (fcn.read(bbuf) != -1); bbuf.flip(); bbufIsReadable = true; } return bufReady; } final int read(char[] cb, int off, int len) throws IOException { CharBuffer cbuf = CharBuffer.wrap(cb, off, len); boolean decodeWasUnderflow = false; boolean atEof = false; while ((cbuf.remaining() > 0) && ! atEof) { int oldRemaining = cbuf.remaining(); atEof = ! ensureReadBbuf(decodeWasUnderflow); CoderResult r = cdec.decode(bbuf, cbuf, atEof ); if (oldRemaining == cbuf.remaining() && CoderResult.OVERFLOW == r) { // if this happens, the decoding failed // but the bufs didn't advance. Advance // them manually and do manual replacing, // otherwise we loop endlessly. This occurs // at least when parsing latin1 files with // lowercase o-umlauts in them // Note that this is at the moment copy-paste // with DecodingReader.read() cbuf.put('?'); bbuf.get(); } decodeWasUnderflow = (CoderResult.UNDERFLOW == r); } if (cbuf.remaining() == len) { return -1; } else { return len - cbuf.remaining(); } } final void write(char[] cb, int off, int len) throws IOException { CharBuffer cbuf = CharBuffer.wrap(cb, off, len); encodeAndWrite(cbuf, false, false); } private final void internalFlush(boolean endOfFile) throws IOException { if (endOfFile) { CharBuffer cbuf = CharBuffer.allocate(0); encodeAndWrite(cbuf, true, endOfFile); } else { flushBbuf(false); } } private final void encodeAndWrite(CharBuffer cbuf, boolean flush, boolean endOfFile) throws IOException { while (cbuf.remaining() > 0) { CoderResult r = cenc.encode(cbuf, bbuf, endOfFile); bbufIsDirty = true; if (CoderResult.OVERFLOW == r || bbuf.remaining() == 0) { flushBbuf(false); bbuf.clear(); bbufIsReadable = false; } if (r.isUnmappable()) { throw new RACFUnmappableCharacterException(cbuf.position(), cbuf.charAt(cbuf.position()), cset.name()); } if (r.isMalformed()) { // We don't really expect Malformed, but not handling it // will cause an infinite loop if we don't... throw new RACFMalformedInputException(cbuf.position(), cbuf.charAt(cbuf.position()), cset.name()); } // UNDERFLOW is the normal condition where cbuf runs out // before bbuf is filled. } if (bbuf.position() > 0 && bbufIsDirty && flush) { flushBbuf(false); } } public final void position(long newPosition) throws IOException { flushBbuf(true); long bbufend = bbufpos // in case bbuf is readable, its contents is valid + (bbufIsReadable ? bbuf.limit() : bbuf.position()); // beyond position() if (newPosition >= bbufpos && newPosition < bbufend) { // near seek. within existing data of bbuf. if (!bbufIsReadable) { //rewinding. keep tail buffered. bbuf.limit(bbuf.position()); bbufIsReadable = true; } bbuf.position((int)(newPosition - bbufpos)); } else { fcn.position(newPosition); // far seek; discard the buffer (it's already cleared) bbuf.clear(); bbuf.flip(); // "there is no useful data on this buffer yet." bbufIsReadable = true; bbufpos = newPosition; } } public final long position() throws IOException { return bbufpos + bbuf.position(); // the logical position within the file. } public final long length() throws IOException { flushBbuf(true); return fcn.size(); } final void flushBbuf(boolean commitOnly) throws IOException { if (commitOnly && !bbufIsDirty) return; //otherwise, we do at least need to increase bbufpos fcn.position(bbufpos); // if the buffer is dirty, the modifications have to be // before position(): before re-positioning, this.position() // calls this function. if (commitOnly) { ByteBuffer dup = bbuf.duplicate(); dup.flip(); fcn.write(dup); //ideally, should restore fcn.position(). but don't for performance. // fcn.position(fcn.position()-dup.position()); bbufIsDirty = false; //this fixed stas's bug, but not mine. return; } if (bbufIsDirty) { bbuf.flip(); fcn.write(bbuf); } bbufpos += bbuf.position(); bbuf.clear(); bbuf.flip(); // there's no useable data in this buffer bbufIsDirty = false; bbufIsReadable = true; } public final int read(byte[] b, int off, int len) throws IOException { int pos = off; boolean atEof = false; while (pos - off < len && ! atEof) { atEof = ! ensureReadBbuf(false); int want = Math.min(off + len - pos, bbuf.remaining()); bbuf.get(b, pos, want); pos += want; } return pos - off; } // a method corresponding to the good ol' ungetc in C. // This function may fail when using (combined) character codes that use // escape sequences to switch between sub-codes. // ASCII, ISO-8859 series, any 8bit code are OK, all unicode variations are OK, // but applications of the ISO-2022 encoding framework can have trouble. // Example of such code is ISO-2022-JP which is used in Japanese e-mail. private CharBuffer singleCharBuf; private ByteBuffer shortByteBuf; public final void unreadChar(char c) throws IOException { // algorithm : // 1. encode c into bytes, to find out how many bytes it corresponds to // 2. move the position backwards that many bytes. // ** we stop here. Don't bother to write the bytes to the buffer, // assuming that it is the same as the original data. // If we allow to write back different characters, the buffer must get 'dirty' // but that would require read/write permissions on files you use unreadChar, // even if you are just reading for some tokenizer. // // So we don't do the following. // 3. write the bytes. // 4. move the position back again. if (singleCharBuf == null) { singleCharBuf = CharBuffer.allocate(1); shortByteBuf = ByteBuffer.allocate((int)cenc.maxBytesPerChar()); } singleCharBuf.clear(); singleCharBuf.append(c); singleCharBuf.flip(); shortByteBuf.clear(); cenc.encode(singleCharBuf, shortByteBuf, false); int n = shortByteBuf.position(); long pos = position() - n; position(pos); } public final void unreadByte(byte b) throws IOException { long pos = position() - 1; position(pos); } final void write(byte[] b, int off, int len) throws IOException { int pos = off; while (pos < off + len) { if (bbuf.remaining() == 0) { flushBbuf(false); bbuf.clear(); bbufIsReadable = false; } int thisBatchLen = Math.min(off + len - pos, bbuf.remaining()); bbuf.put(b, pos, thisBatchLen); pos += thisBatchLen; bbufIsDirty = true; } } }