001    /*
002     *  Licensed to the Apache Software Foundation (ASF) under one or more
003     *  contributor license agreements.  See the NOTICE file distributed with
004     *  this work for additional information regarding copyright ownership.
005     *  The ASF licenses this file to You under the Apache License, Version 2.0
006     *  (the "License"); you may not use this file except in compliance with
007     *  the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     *  Unless required by applicable law or agreed to in writing, software
012     *  distributed under the License is distributed on an "AS IS" BASIS,
013     *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     *  See the License for the specific language governing permissions and
015     *  limitations under the License.
016     *
017     */
018    
019    /*
020     * This package is based on the work done by Timothy Gerard Endres
021     * (time@ice.com) to whom the Ant project is very grateful for his great code.
022     */
023    
024    package org.apache.commons.compress.archivers.tar;
025    
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
034    
035    import org.apache.commons.compress.archivers.ArchiveEntry;
036    import org.apache.commons.compress.archivers.ArchiveInputStream;
037    import org.apache.commons.compress.utils.ArchiveUtils;
038    
039    /**
040     * The TarInputStream reads a UNIX tar archive as an InputStream.
041     * methods are provided to position at each successive entry in
042     * the archive, and the read each entry as a normal input stream
043     * using read().
044     * @NotThreadSafe
045     */
046    public class TarArchiveInputStream extends ArchiveInputStream {
047        private static final int SMALL_BUFFER_SIZE = 256;
048        private static final int BUFFER_SIZE = 8 * 1024;
049    
050        private boolean hasHitEOF;
051        private long entrySize;
052        private long entryOffset;
053        private byte[] readBuf;
054        protected final TarBuffer buffer;
055        private TarArchiveEntry currEntry;
056    
057        /**
058         * Constructor for TarInputStream.
059         * @param is the input stream to use
060         */
061        public TarArchiveInputStream(InputStream is) {
062            this(is, TarBuffer.DEFAULT_BLKSIZE, TarBuffer.DEFAULT_RCDSIZE);
063        }
064    
065        /**
066         * Constructor for TarInputStream.
067         * @param is the input stream to use
068         * @param blockSize the block size to use
069         */
070        public TarArchiveInputStream(InputStream is, int blockSize) {
071            this(is, blockSize, TarBuffer.DEFAULT_RCDSIZE);
072        }
073    
074        /**
075         * Constructor for TarInputStream.
076         * @param is the input stream to use
077         * @param blockSize the block size to use
078         * @param recordSize the record size to use
079         */
080        public TarArchiveInputStream(InputStream is, int blockSize, int recordSize) {
081            this.buffer = new TarBuffer(is, blockSize, recordSize);
082            this.readBuf = null;
083            this.hasHitEOF = false;
084        }
085    
086        /**
087         * Closes this stream. Calls the TarBuffer's close() method.
088         * @throws IOException on error
089         */
090        public void close() throws IOException {
091            buffer.close();
092        }
093    
094        /**
095         * Get the record size being used by this stream's TarBuffer.
096         *
097         * @return The TarBuffer record size.
098         */
099        public int getRecordSize() {
100            return buffer.getRecordSize();
101        }
102    
103        /**
104         * Get the available data that can be read from the current
105         * entry in the archive. This does not indicate how much data
106         * is left in the entire archive, only in the current entry.
107         * This value is determined from the entry's size header field
108         * and the amount of data already read from the current entry.
109         * Integer.MAX_VALUE is returen in case more than Integer.MAX_VALUE
110         * bytes are left in the current entry in the archive.
111         *
112         * @return The number of available bytes for the current entry.
113         * @throws IOException for signature
114         */
115        public int available() throws IOException {
116            if (entrySize - entryOffset > Integer.MAX_VALUE) {
117                return Integer.MAX_VALUE;
118            }
119            return (int) (entrySize - entryOffset);
120        }
121    
122        /**
123         * Skip bytes in the input buffer. This skips bytes in the
124         * current entry's data, not the entire archive, and will
125         * stop at the end of the current entry's data if the number
126         * to skip extends beyond that point.
127         *
128         * @param numToSkip The number of bytes to skip.
129         * @return the number actually skipped
130         * @throws IOException on error
131         */
132        public long skip(long numToSkip) throws IOException {
133            // REVIEW
134            // This is horribly inefficient, but it ensures that we
135            // properly skip over bytes via the TarBuffer...
136            //
137            byte[] skipBuf = new byte[BUFFER_SIZE];
138            long skip = numToSkip;
139            while (skip > 0) {
140                int realSkip = (int) (skip > skipBuf.length ? skipBuf.length : skip);
141                int numRead = read(skipBuf, 0, realSkip);
142                if (numRead == -1) {
143                    break;
144                }
145                skip -= numRead;
146            }
147            return (numToSkip - skip);
148        }
149    
150        /**
151         * Since we do not support marking just yet, we do nothing.
152         */
153        public synchronized void reset() {
154        }
155    
156        /**
157         * Get the next entry in this tar archive. This will skip
158         * over any remaining data in the current entry, if there
159         * is one, and place the input stream at the header of the
160         * next entry, and read the header and instantiate a new
161         * TarEntry from the header bytes and return that entry.
162         * If there are no more entries in the archive, null will
163         * be returned to indicate that the end of the archive has
164         * been reached.
165         *
166         * @return The next TarEntry in the archive, or null.
167         * @throws IOException on error
168         */
169        public TarArchiveEntry getNextTarEntry() throws IOException {
170            if (hasHitEOF) {
171                return null;
172            }
173    
174            if (currEntry != null) {
175                long numToSkip = entrySize - entryOffset;
176    
177                while (numToSkip > 0) {
178                    long skipped = skip(numToSkip);
179                    if (skipped <= 0) {
180                        throw new RuntimeException("failed to skip current tar entry");
181                    }
182                    numToSkip -= skipped;
183                }
184    
185                readBuf = null;
186            }
187    
188            byte[] headerBuf = getRecord();
189    
190            if (hasHitEOF) {
191                currEntry = null;
192                return null;
193            }
194    
195            currEntry = new TarArchiveEntry(headerBuf);
196            entryOffset = 0;
197            entrySize = currEntry.getSize();
198    
199            if (currEntry.isGNULongNameEntry()) {
200                // read in the name
201                StringBuffer longName = new StringBuffer();
202                byte[] buf = new byte[SMALL_BUFFER_SIZE];
203                int length = 0;
204                while ((length = read(buf)) >= 0) {
205                    longName.append(new String(buf, 0, length));
206                }
207                getNextEntry();
208                if (currEntry == null) {
209                    // Bugzilla: 40334
210                    // Malformed tar file - long entry name not followed by entry
211                    return null;
212                }
213                // remove trailing null terminator
214                if (longName.length() > 0
215                    && longName.charAt(longName.length() - 1) == 0) {
216                    longName.deleteCharAt(longName.length() - 1);
217                }
218                currEntry.setName(longName.toString());
219            }
220    
221            if (currEntry.isPaxHeader()){ // Process Pax headers
222                paxHeaders();
223            }
224    
225            if (currEntry.isGNUSparse()){ // Process sparse files
226                readGNUSparse();
227            }
228    
229            return currEntry;
230        }
231    
232        /**
233         * Get the next record in this tar archive. This will skip
234         * over any remaining data in the current entry, if there
235         * is one, and place the input stream at the header of the
236         * next entry.
237         * If there are no more entries in the archive, null will
238         * be returned to indicate that the end of the archive has
239         * been reached.
240         *
241         * @return The next header in the archive, or null.
242         * @throws IOException on error
243         */
244        private byte[] getRecord() throws IOException {
245            if (hasHitEOF) {
246                return null;
247            }
248    
249            byte[] headerBuf = buffer.readRecord();
250    
251            if (headerBuf == null) {
252                hasHitEOF = true;
253            } else if (buffer.isEOFRecord(headerBuf)) {
254                hasHitEOF = true;
255            }
256    
257            return hasHitEOF ? null : headerBuf;
258        }
259    
260        private void paxHeaders() throws IOException{
261            BufferedReader br = new BufferedReader(new InputStreamReader(this, "UTF-8"));
262            Map headers = new HashMap();
263            // Format is "length keyword=value\n";
264            while(true){ // get length
265                int ch;
266                int len=0;
267                int read=0;
268                while((ch = br.read()) != -1){
269                    read++;
270                    if (ch == ' '){ // End of length string
271                        // Get keyword
272                        StringBuffer sb = new StringBuffer();
273                        while((ch = br.read()) != -1){
274                            read++;
275                            if (ch == '='){ // end of keyword
276                                String keyword = sb.toString();
277                                // Get rest of entry
278                                char[] cbuf = new char[len-read];
279                                int got = br.read(cbuf);
280                                if (got != len-read){
281                                    throw new IOException("Failed to read Paxheader. Expected "+(len-read)+" chars, read "+got);
282                                }
283                                String value = new String(cbuf, 0 , len-read-1); // Drop trailing NL
284                                headers.put(keyword, value);
285                                break;
286                            }
287                            sb.append((char)ch);
288                        }
289                        break; // Processed single header
290                    }
291                    len *= 10;
292                    len += ch - '0';
293                }
294                if (ch == -1){ // EOF
295                    break;
296                }
297            }
298            getNextEntry(); // Get the actual file entry
299            /*
300             * The following headers are defined for Pax.
301             * atime, ctime, mtime, charset: cannot use these without changing TarArchiveEntry fields
302             * comment
303             * gid, gname
304             * linkpath
305             * size
306             * uid,uname
307             */
308            Iterator hdrs = headers.entrySet().iterator();
309            while(hdrs.hasNext()){
310                Entry ent = (Entry) hdrs.next();
311                String key = (String) ent.getKey();
312                String val = (String) ent.getValue();
313                if ("path".equals(key)){
314                    currEntry.setName(val);
315                } else if ("linkpath".equals(key)){
316                    currEntry.setLinkName(val);
317                } else if ("gid".equals(key)){
318                    currEntry.setGroupId(Integer.parseInt(val));
319                } else if ("gname".equals(key)){
320                    currEntry.setGroupName(val);
321                } else if ("uid".equals(key)){
322                    currEntry.setUserId(Integer.parseInt(val));
323                } else if ("uname".equals(key)){
324                    currEntry.setUserName(val);
325                } else if ("size".equals(key)){
326                    currEntry.setSize(Long.parseLong(val));
327                }
328            }
329        }
330    
331        /**
332         * Adds the sparse chunks from the current entry to the sparse chunks,
333         * including any additional sparse entries following the current entry.
334         * 
335         * @throws IOException on error 
336         * 
337         * @todo Sparse files get not yet really processed. 
338         */
339        private void readGNUSparse() throws IOException {
340            /* we do not really process sparse files yet
341            sparses = new ArrayList();
342            sparses.addAll(currEntry.getSparses());
343            */
344            if (currEntry.isExtended()) {
345                TarArchiveSparseEntry entry;
346                do {
347                    byte[] headerBuf = getRecord();
348                    if (hasHitEOF) {
349                        currEntry = null;
350                        break;
351                    }
352                    entry = new TarArchiveSparseEntry(headerBuf);
353                    /* we do not really process sparse files yet
354                    sparses.addAll(entry.getSparses());
355                    */
356                } while (entry.isExtended());
357            }
358        }
359    
360        public ArchiveEntry getNextEntry() throws IOException {
361            return getNextTarEntry();
362        }
363    
364        /**
365         * Reads bytes from the current tar archive entry.
366         *
367         * This method is aware of the boundaries of the current
368         * entry in the archive and will deal with them as if they
369         * were this stream's start and EOF.
370         *
371         * @param buf The buffer into which to place bytes read.
372         * @param offset The offset at which to place bytes read.
373         * @param numToRead The number of bytes to read.
374         * @return The number of bytes read, or -1 at EOF.
375         * @throws IOException on error
376         */
377        public int read(byte[] buf, int offset, int numToRead) throws IOException {
378            int totalRead = 0;
379    
380            if (entryOffset >= entrySize) {
381                return -1;
382            }
383    
384            if ((numToRead + entryOffset) > entrySize) {
385                numToRead = (int) (entrySize - entryOffset);
386            }
387    
388            if (readBuf != null) {
389                int sz = (numToRead > readBuf.length) ? readBuf.length
390                    : numToRead;
391    
392                System.arraycopy(readBuf, 0, buf, offset, sz);
393    
394                if (sz >= readBuf.length) {
395                    readBuf = null;
396                } else {
397                    int newLen = readBuf.length - sz;
398                    byte[] newBuf = new byte[newLen];
399    
400                    System.arraycopy(readBuf, sz, newBuf, 0, newLen);
401    
402                    readBuf = newBuf;
403                }
404    
405                totalRead += sz;
406                numToRead -= sz;
407                offset += sz;
408            }
409    
410            while (numToRead > 0) {
411                byte[] rec = buffer.readRecord();
412    
413                if (rec == null) {
414                    // Unexpected EOF!
415                    throw new IOException("unexpected EOF with " + numToRead
416                                          + " bytes unread. Occured at byte: " + getBytesRead());
417                }
418                count(rec.length);
419                int sz = numToRead;
420                int recLen = rec.length;
421    
422                if (recLen > sz) {
423                    System.arraycopy(rec, 0, buf, offset, sz);
424    
425                    readBuf = new byte[recLen - sz];
426    
427                    System.arraycopy(rec, sz, readBuf, 0, recLen - sz);
428                } else {
429                    sz = recLen;
430    
431                    System.arraycopy(rec, 0, buf, offset, recLen);
432                }
433    
434                totalRead += sz;
435                numToRead -= sz;
436                offset += sz;
437            }
438    
439            entryOffset += totalRead;
440    
441            return totalRead;
442        }
443    
444        /**
445         * Whether this class is able to read the given entry.
446         *
447         * <p>May return false if the current entry is a sparse file.</p>
448         */
449        public boolean canReadEntryData(ArchiveEntry ae) {
450            if (ae instanceof TarArchiveEntry) {
451                TarArchiveEntry te = (TarArchiveEntry) ae;
452                return !te.isGNUSparse();
453            }
454            return false;
455        }
456    
457        protected final TarArchiveEntry getCurrentEntry() {
458            return currEntry;
459        }
460    
461        protected final void setCurrentEntry(TarArchiveEntry e) {
462            currEntry = e;
463        }
464    
465        protected final boolean isAtEOF() {
466            return hasHitEOF;
467        }
468    
469        protected final void setAtEOF(boolean b) {
470            hasHitEOF = b;
471        }
472    
473        /**
474         * Checks if the signature matches what is expected for a tar file.
475         * 
476         * @param signature
477         *            the bytes to check
478         * @param length
479         *            the number of bytes to check
480         * @return true, if this stream is a tar archive stream, false otherwise
481         */
482        public static boolean matches(byte[] signature, int length) {
483            if (length < TarConstants.VERSION_OFFSET+TarConstants.VERSIONLEN) {
484                return false;
485            }
486    
487            if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX,
488                    signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
489                &&
490                ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX,
491                    signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
492                    ){
493                return true;
494            }
495            if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU,
496                    signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
497                &&
498                (
499                 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE,
500                    signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
501                ||
502                ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO,
503                    signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
504                )
505                    ){
506                return true;
507            }
508            // COMPRESS-107 - recognise Ant tar files
509            if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT,
510                    signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
511                &&
512                ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT,
513                    signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
514                    ){
515                return true;
516            }
517            return false;
518        }
519    
520    }