001    /*
002     *  Licensed to the Apache Software Foundation (ASF) under one or more
003     *  contributor license agreements.  See the NOTICE file distributed with
004     *  this work for additional information regarding copyright ownership.
005     *  The ASF licenses this file to You under the Apache License, Version 2.0
006     *  (the "License"); you may not use this file except in compliance with
007     *  the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     *  Unless required by applicable law or agreed to in writing, software
012     *  distributed under the License is distributed on an "AS IS" BASIS,
013     *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     *  See the License for the specific language governing permissions and
015     *  limitations under the License.
016     *
017     */
018    
019    /*
020     * This package is based on the work done by Timothy Gerard Endres
021     * (time@ice.com) to whom the Ant project is very grateful for his great code.
022     */
023    
024    package org.apache.commons.compress.archivers.tar;
025    
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.utils.ArchiveUtils;
037    
038    /**
039     * The TarInputStream reads a UNIX tar archive as an InputStream.
040     * methods are provided to position at each successive entry in
041     * the archive, and the read each entry as a normal input stream
042     * using read().
043     * @NotThreadSafe
044     */
045    public class TarArchiveInputStream extends ArchiveInputStream {
046        private static final int SMALL_BUFFER_SIZE = 256;
047        private static final int BUFFER_SIZE = 8 * 1024;
048    
049        private boolean hasHitEOF;
050        private long entrySize;
051        private long entryOffset;
052        private byte[] readBuf;
053        protected final TarBuffer buffer;
054        private TarArchiveEntry currEntry;
055    
056        /**
057         * Constructor for TarInputStream.
058         * @param is the input stream to use
059         */
060        public TarArchiveInputStream(InputStream is) {
061            this(is, TarBuffer.DEFAULT_BLKSIZE, TarBuffer.DEFAULT_RCDSIZE);
062        }
063    
064        /**
065         * Constructor for TarInputStream.
066         * @param is the input stream to use
067         * @param blockSize the block size to use
068         */
069        public TarArchiveInputStream(InputStream is, int blockSize) {
070            this(is, blockSize, TarBuffer.DEFAULT_RCDSIZE);
071        }
072    
073        /**
074         * Constructor for TarInputStream.
075         * @param is the input stream to use
076         * @param blockSize the block size to use
077         * @param recordSize the record size to use
078         */
079        public TarArchiveInputStream(InputStream is, int blockSize, int recordSize) {
080            this.buffer = new TarBuffer(is, blockSize, recordSize);
081            this.readBuf = null;
082            this.hasHitEOF = false;
083        }
084    
085        /**
086         * Closes this stream. Calls the TarBuffer's close() method.
087         * @throws IOException on error
088         */
089        @Override
090        public void close() throws IOException {
091            buffer.close();
092        }
093    
094        /**
095         * Get the record size being used by this stream's TarBuffer.
096         *
097         * @return The TarBuffer record size.
098         */
099        public int getRecordSize() {
100            return buffer.getRecordSize();
101        }
102    
103        /**
104         * Get the available data that can be read from the current
105         * entry in the archive. This does not indicate how much data
106         * is left in the entire archive, only in the current entry.
107         * This value is determined from the entry's size header field
108         * and the amount of data already read from the current entry.
109         * Integer.MAX_VALUE is returen in case more than Integer.MAX_VALUE
110         * bytes are left in the current entry in the archive.
111         *
112         * @return The number of available bytes for the current entry.
113         * @throws IOException for signature
114         */
115        @Override
116        public int available() throws IOException {
117            if (entrySize - entryOffset > Integer.MAX_VALUE) {
118                return Integer.MAX_VALUE;
119            }
120            return (int) (entrySize - entryOffset);
121        }
122    
123        /**
124         * Skip bytes in the input buffer. This skips bytes in the
125         * current entry's data, not the entire archive, and will
126         * stop at the end of the current entry's data if the number
127         * to skip extends beyond that point.
128         *
129         * @param numToSkip The number of bytes to skip.
130         * @return the number actually skipped
131         * @throws IOException on error
132         */
133        @Override
134        public long skip(long numToSkip) throws IOException {
135            // REVIEW
136            // This is horribly inefficient, but it ensures that we
137            // properly skip over bytes via the TarBuffer...
138            //
139            byte[] skipBuf = new byte[BUFFER_SIZE];
140            long skip = numToSkip;
141            while (skip > 0) {
142                int realSkip = (int) (skip > skipBuf.length ? skipBuf.length : skip);
143                int numRead = read(skipBuf, 0, realSkip);
144                if (numRead == -1) {
145                    break;
146                }
147                skip -= numRead;
148            }
149            return (numToSkip - skip);
150        }
151    
152        /**
153         * Since we do not support marking just yet, we do nothing.
154         */
155        @Override
156        public synchronized void reset() {
157        }
158    
159        /**
160         * Get the next entry in this tar archive. This will skip
161         * over any remaining data in the current entry, if there
162         * is one, and place the input stream at the header of the
163         * next entry, and read the header and instantiate a new
164         * TarEntry from the header bytes and return that entry.
165         * If there are no more entries in the archive, null will
166         * be returned to indicate that the end of the archive has
167         * been reached.
168         *
169         * @return The next TarEntry in the archive, or null.
170         * @throws IOException on error
171         */
172        public TarArchiveEntry getNextTarEntry() throws IOException {
173            if (hasHitEOF) {
174                return null;
175            }
176    
177            if (currEntry != null) {
178                long numToSkip = entrySize - entryOffset;
179    
180                while (numToSkip > 0) {
181                    long skipped = skip(numToSkip);
182                    if (skipped <= 0) {
183                        throw new RuntimeException("failed to skip current tar entry");
184                    }
185                    numToSkip -= skipped;
186                }
187    
188                readBuf = null;
189            }
190    
191            byte[] headerBuf = getRecord();
192    
193            if (hasHitEOF) {
194                currEntry = null;
195                return null;
196            }
197    
198            currEntry = new TarArchiveEntry(headerBuf);
199            entryOffset = 0;
200            entrySize = currEntry.getSize();
201    
202            if (currEntry.isGNULongNameEntry()) {
203                // read in the name
204                StringBuffer longName = new StringBuffer();
205                byte[] buf = new byte[SMALL_BUFFER_SIZE];
206                int length = 0;
207                while ((length = read(buf)) >= 0) {
208                    longName.append(new String(buf, 0, length));
209                }
210                getNextEntry();
211                if (currEntry == null) {
212                    // Bugzilla: 40334
213                    // Malformed tar file - long entry name not followed by entry
214                    return null;
215                }
216                // remove trailing null terminator
217                if (longName.length() > 0
218                    && longName.charAt(longName.length() - 1) == 0) {
219                    longName.deleteCharAt(longName.length() - 1);
220                }
221                currEntry.setName(longName.toString());
222            }
223    
224            if (currEntry.isPaxHeader()){ // Process Pax headers
225                paxHeaders();
226            }
227    
228            if (currEntry.isGNUSparse()){ // Process sparse files
229                readGNUSparse();
230            }
231    
232            return currEntry;
233        }
234    
235        /**
236         * Get the next record in this tar archive. This will skip
237         * over any remaining data in the current entry, if there
238         * is one, and place the input stream at the header of the
239         * next entry.
240         * If there are no more entries in the archive, null will
241         * be returned to indicate that the end of the archive has
242         * been reached.
243         *
244         * @return The next header in the archive, or null.
245         * @throws IOException on error
246         */
247        private byte[] getRecord() throws IOException {
248            if (hasHitEOF) {
249                return null;
250            }
251    
252            byte[] headerBuf = buffer.readRecord();
253    
254            if (headerBuf == null) {
255                hasHitEOF = true;
256            } else if (buffer.isEOFRecord(headerBuf)) {
257                hasHitEOF = true;
258            }
259    
260            return hasHitEOF ? null : headerBuf;
261        }
262    
263        private void paxHeaders() throws IOException{
264            Reader br = new InputStreamReader(this, "UTF-8") {
265                    @Override
266                    public void close() {
267                        // make sure GC doesn't close "this" before we are done
268                    }
269                };
270            Map<String, String> headers = new HashMap<String, String>();
271            // Format is "length keyword=value\n";
272            try {
273                while(true){ // get length
274                    int ch;
275                    int len = 0;
276                    int read = 0;
277                    while((ch = br.read()) != -1){
278                        read++;
279                        if (ch == ' '){ // End of length string
280                            // Get keyword
281                            StringBuffer sb = new StringBuffer();
282                            while((ch = br.read()) != -1){
283                                read++;
284                                if (ch == '='){ // end of keyword
285                                    String keyword = sb.toString();
286                                    // Get rest of entry
287                                    char[] cbuf = new char[len-read];
288                                    int got = br.read(cbuf);
289                                    if (got != len - read){
290                                        throw new IOException("Failed to read "
291                                                              + "Paxheader. Expected "
292                                                              + (len - read)
293                                                              + " chars, read "
294                                                              + got);
295                                    }
296                                    // Drop trailing NL
297                                    String value = new String(cbuf, 0,
298                                                              len - read - 1);
299                                    headers.put(keyword, value);
300                                    break;
301                                }
302                                sb.append((char) ch);
303                            }
304                            break; // Processed single header
305                        }
306                        len *= 10;
307                        len += ch - '0';
308                    }
309                    if (ch == -1){ // EOF
310                        break;
311                    }
312                }
313            } finally {
314                // NO-OP but makes FindBugs happy
315                br.close();
316            }
317    
318            getNextEntry(); // Get the actual file entry
319            /*
320             * The following headers are defined for Pax.
321             * atime, ctime, mtime, charset: cannot use these without changing TarArchiveEntry fields
322             * comment
323             * gid, gname
324             * linkpath
325             * size
326             * uid,uname
327             */
328            for (Entry<String, String> ent : headers.entrySet()){
329                String key = ent.getKey();
330                String val = ent.getValue();
331                if ("path".equals(key)){
332                    currEntry.setName(val);
333                } else if ("linkpath".equals(key)){
334                    currEntry.setLinkName(val);
335                } else if ("gid".equals(key)){
336                    currEntry.setGroupId(Integer.parseInt(val));
337                } else if ("gname".equals(key)){
338                    currEntry.setGroupName(val);
339                } else if ("uid".equals(key)){
340                    currEntry.setUserId(Integer.parseInt(val));
341                } else if ("uname".equals(key)){
342                    currEntry.setUserName(val);
343                } else if ("size".equals(key)){
344                    currEntry.setSize(Long.parseLong(val));
345                }
346            }
347        }
348    
349        /**
350         * Adds the sparse chunks from the current entry to the sparse chunks,
351         * including any additional sparse entries following the current entry.
352         * 
353         * @throws IOException on error 
354         * 
355         * @todo Sparse files get not yet really processed. 
356         */
357        private void readGNUSparse() throws IOException {
358            /* we do not really process sparse files yet
359            sparses = new ArrayList();
360            sparses.addAll(currEntry.getSparses());
361            */
362            if (currEntry.isExtended()) {
363                TarArchiveSparseEntry entry;
364                do {
365                    byte[] headerBuf = getRecord();
366                    if (hasHitEOF) {
367                        currEntry = null;
368                        break;
369                    }
370                    entry = new TarArchiveSparseEntry(headerBuf);
371                    /* we do not really process sparse files yet
372                    sparses.addAll(entry.getSparses());
373                    */
374                } while (entry.isExtended());
375            }
376        }
377    
378        @Override
379        public ArchiveEntry getNextEntry() throws IOException {
380            return getNextTarEntry();
381        }
382    
383        /**
384         * Reads bytes from the current tar archive entry.
385         *
386         * This method is aware of the boundaries of the current
387         * entry in the archive and will deal with them as if they
388         * were this stream's start and EOF.
389         *
390         * @param buf The buffer into which to place bytes read.
391         * @param offset The offset at which to place bytes read.
392         * @param numToRead The number of bytes to read.
393         * @return The number of bytes read, or -1 at EOF.
394         * @throws IOException on error
395         */
396        @Override
397        public int read(byte[] buf, int offset, int numToRead) throws IOException {
398            int totalRead = 0;
399    
400            if (entryOffset >= entrySize) {
401                return -1;
402            }
403    
404            if ((numToRead + entryOffset) > entrySize) {
405                numToRead = (int) (entrySize - entryOffset);
406            }
407    
408            if (readBuf != null) {
409                int sz = (numToRead > readBuf.length) ? readBuf.length
410                    : numToRead;
411    
412                System.arraycopy(readBuf, 0, buf, offset, sz);
413    
414                if (sz >= readBuf.length) {
415                    readBuf = null;
416                } else {
417                    int newLen = readBuf.length - sz;
418                    byte[] newBuf = new byte[newLen];
419    
420                    System.arraycopy(readBuf, sz, newBuf, 0, newLen);
421    
422                    readBuf = newBuf;
423                }
424    
425                totalRead += sz;
426                numToRead -= sz;
427                offset += sz;
428            }
429    
430            while (numToRead > 0) {
431                byte[] rec = buffer.readRecord();
432    
433                if (rec == null) {
434                    // Unexpected EOF!
435                    throw new IOException("unexpected EOF with " + numToRead
436                                          + " bytes unread. Occured at byte: " + getBytesRead());
437                }
438                count(rec.length);
439                int sz = numToRead;
440                int recLen = rec.length;
441    
442                if (recLen > sz) {
443                    System.arraycopy(rec, 0, buf, offset, sz);
444    
445                    readBuf = new byte[recLen - sz];
446    
447                    System.arraycopy(rec, sz, readBuf, 0, recLen - sz);
448                } else {
449                    sz = recLen;
450    
451                    System.arraycopy(rec, 0, buf, offset, recLen);
452                }
453    
454                totalRead += sz;
455                numToRead -= sz;
456                offset += sz;
457            }
458    
459            entryOffset += totalRead;
460    
461            return totalRead;
462        }
463    
464        /**
465         * Whether this class is able to read the given entry.
466         *
467         * <p>May return false if the current entry is a sparse file.</p>
468         */
469        @Override
470        public boolean canReadEntryData(ArchiveEntry ae) {
471            if (ae instanceof TarArchiveEntry) {
472                TarArchiveEntry te = (TarArchiveEntry) ae;
473                return !te.isGNUSparse();
474            }
475            return false;
476        }
477    
478        protected final TarArchiveEntry getCurrentEntry() {
479            return currEntry;
480        }
481    
482        protected final void setCurrentEntry(TarArchiveEntry e) {
483            currEntry = e;
484        }
485    
486        protected final boolean isAtEOF() {
487            return hasHitEOF;
488        }
489    
490        protected final void setAtEOF(boolean b) {
491            hasHitEOF = b;
492        }
493    
494        /**
495         * Checks if the signature matches what is expected for a tar file.
496         * 
497         * @param signature
498         *            the bytes to check
499         * @param length
500         *            the number of bytes to check
501         * @return true, if this stream is a tar archive stream, false otherwise
502         */
503        public static boolean matches(byte[] signature, int length) {
504            if (length < TarConstants.VERSION_OFFSET+TarConstants.VERSIONLEN) {
505                return false;
506            }
507    
508            if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_POSIX,
509                    signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
510                &&
511                ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_POSIX,
512                    signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
513                    ){
514                return true;
515            }
516            if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_GNU,
517                    signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
518                &&
519                (
520                 ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_SPACE,
521                    signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
522                ||
523                ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_GNU_ZERO,
524                    signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
525                )
526                    ){
527                return true;
528            }
529            // COMPRESS-107 - recognise Ant tar files
530            if (ArchiveUtils.matchAsciiBuffer(TarConstants.MAGIC_ANT,
531                    signature, TarConstants.MAGIC_OFFSET, TarConstants.MAGICLEN)
532                &&
533                ArchiveUtils.matchAsciiBuffer(TarConstants.VERSION_ANT,
534                    signature, TarConstants.VERSION_OFFSET, TarConstants.VERSIONLEN)
535                    ){
536                return true;
537            }
538            return false;
539        }
540    
541    }