/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapreduce.lib.input;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.HashSet;
import java.util.List;
import java.util.HashMap;
import java.util.Set;
import java.util.Iterator;
import java.util.Map;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.net.NodeBase;
import org.apache.hadoop.net.NetworkTopology;

import com.google.common.annotations.VisibleForTesting;
/**
 * An abstract {@link InputFormat} that returns {@link CombineFileSplit}s
 * from its {@link InputFormat#getSplits(JobContext)} method.
 *
 * Splits are constructed from the files under the input paths.
 * A split cannot have files from different pools.
 * Each split returned may contain blocks from different files.
 * If a maxSplitSize is specified, then blocks on the same node are
 * combined to form a single split. Blocks that are left over are
 * then combined with other blocks in the same rack.
 * If maxSplitSize is not specified, then blocks from the same rack
 * are combined in a single split; no attempt is made to create
 * node-local splits.
 * If the maxSplitSize is equal to the block size, then this class
 * is similar to the default splitting behavior in Hadoop: each
 * block is a locally processed split.
 * Subclasses implement
 * {@link InputFormat#createRecordReader(InputSplit, TaskAttemptContext)}
 * to construct <code>RecordReader</code>s for
 * <code>CombineFileSplit</code>s.
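 *
 * <p>For example, a minimal subclass might look like the sketch below
 * (illustrative only; <code>MyChunkReader</code> stands for a hypothetical
 * per-file {@link RecordReader} with the constructor expected by
 * {@link CombineFileRecordReader}):
 *
 * <pre>
 * public class MyCombineInputFormat
 *     extends CombineFileInputFormat&lt;LongWritable, Text&gt; {
 *
 *   public MyCombineInputFormat() {
 *     // cap each combined split at roughly 128 MB; combine leftover blocks
 *     // on a node into their own split only when they total at least 16 MB
 *     setMaxSplitSize(128 * 1024 * 1024);
 *     setMinSplitSizeNode(16 * 1024 * 1024);
 *   }
 *
 *   public RecordReader&lt;LongWritable, Text&gt; createRecordReader(
 *       InputSplit split, TaskAttemptContext context) throws IOException {
 *     return new CombineFileRecordReader&lt;LongWritable, Text&gt;(
 *         (CombineFileSplit) split, context, MyChunkReader.class);
 *   }
 * }
 * </pre>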
 *
 * @see CombineFileSplit
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public abstract class CombineFileInputFormat<K, V>
  extends FileInputFormat<K, V> {

  public static final String SPLIT_MINSIZE_PERNODE =
    "mapreduce.input.fileinputformat.split.minsize.per.node";
  public static final String SPLIT_MINSIZE_PERRACK =
    "mapreduce.input.fileinputformat.split.minsize.per.rack";
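  // These thresholds may also be set per job in the configuration, e.g.
  // (illustrative): conf.setLong(SPLIT_MINSIZE_PERNODE, 32L * 1024 * 1024);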
  // ability to limit the size of a single split
  private long maxSplitSize = 0;
  private long minSplitSizeNode = 0;
  private long minSplitSizeRack = 0;

  // A pool of input path filters. A split cannot have blocks from files
  // across multiple pools.
  private ArrayList<MultiPathFilter> pools = new ArrayList<MultiPathFilter>();

  // mapping from a rack name to the set of nodes in the rack
  private HashMap<String, Set<String>> rackToNodes =
                            new HashMap<String, Set<String>>();
  /**
   * Specify the maximum size (in bytes) of each split. Each split is
   * approximately equal to the specified size.
   */
  protected void setMaxSplitSize(long maxSplitSize) {
    this.maxSplitSize = maxSplitSize;
  }

  /**
   * Specify the minimum size (in bytes) of each split per node.
   * This applies to data that is left over after combining data on a single
   * node into splits that are of maximum size specified by maxSplitSize.
   * This leftover data will be combined into its own split if its size
   * exceeds minSplitSizeNode.
   */
  protected void setMinSplitSizeNode(long minSplitSizeNode) {
    this.minSplitSizeNode = minSplitSizeNode;
  }

  /**
   * Specify the minimum size (in bytes) of each split per rack.
   * This applies to data that is left over after combining data on a single
   * rack into splits that are of maximum size specified by maxSplitSize.
   * This leftover data will be combined into its own split if its size
   * exceeds minSplitSizeRack.
   */
  protected void setMinSplitSizeRack(long minSplitSizeRack) {
    this.minSplitSizeRack = minSplitSizeRack;
  }

  /**
   * Create a new pool and add the filters to it.
   * A split cannot have files from different pools.
   */
  protected void createPool(List<PathFilter> filters) {
    pools.add(new MultiPathFilter(filters));
  }

  /**
   * Create a new pool and add the filters to it.
   * A pathname can satisfy any one of the specified filters.
   * A split cannot have files from different pools.
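   *
   * <p>For example (an illustrative sketch; the suffix-based filters are
   * hypothetical):
   * <pre>
   * // keep .txt and .log files in separate pools so the two kinds of
   * // files are never combined into the same split
   * createPool(new PathFilter() {
   *   public boolean accept(Path path) {
   *     return path.getName().endsWith(".txt");
   *   }
   * });
   * createPool(new PathFilter() {
   *   public boolean accept(Path path) {
   *     return path.getName().endsWith(".log");
   *   }
   * });
   * </pre>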
   */
  protected void createPool(PathFilter... filters) {
    MultiPathFilter multi = new MultiPathFilter();
    for (PathFilter f: filters) {
      multi.add(f);
    }
    pools.add(multi);
  }

  @Override
  protected boolean isSplitable(JobContext context, Path file) {
    final CompressionCodec codec =
      new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    if (null == codec) {
      return true;
    }
    return codec instanceof SplittableCompressionCodec;
  }

  /**
   * Default constructor.
   */
  public CombineFileInputFormat() {
  }

  @Override
  public List<InputSplit> getSplits(JobContext job)
    throws IOException {
    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;
    Configuration conf = job.getConfiguration();

    // the values specified by setXxxSplitSize() take precedence over the
    // values that might have been specified in the config
    if (minSplitSizeNode != 0) {
      minSizeNode = minSplitSizeNode;
    } else {
      minSizeNode = conf.getLong(SPLIT_MINSIZE_PERNODE, 0);
    }
    if (minSplitSizeRack != 0) {
      minSizeRack = minSplitSizeRack;
    } else {
      minSizeRack = conf.getLong(SPLIT_MINSIZE_PERRACK, 0);
    }
    if (maxSplitSize != 0) {
      maxSize = maxSplitSize;
    } else {
      maxSize = conf.getLong("mapreduce.input.fileinputformat.split.maxsize", 0);
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
      throw new IOException("Minimum split size per node " + minSizeNode +
                            " cannot be larger than maximum split size " +
                            maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
      throw new IOException("Minimum split size per rack " + minSizeRack +
                            " cannot be larger than maximum split size " +
                            maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
      throw new IOException("Minimum split size per node " + minSizeNode +
                            " cannot be smaller than minimum split " +
                            "size per rack " + minSizeRack);
    }

    // all the files in input set
    Path[] paths = FileUtil.stat2Paths(
                     listStatus(job).toArray(new FileStatus[0]));
    List<InputSplit> splits = new ArrayList<InputSplit>();
    if (paths.length == 0) {
      return splits;
    }

    // Qualify the paths against their file systems first. This is a costly
    // operation, so do it once up front rather than repeating it for every
    // pool in the loop below.
    List<Path> newpaths = new LinkedList<Path>();
    for (int i = 0; i < paths.length; i++) {
      FileSystem fs = paths[i].getFileSystem(conf);
      Path p = fs.makeQualified(paths[i]);
      newpaths.add(p);
    }

    // In one single iteration, process all the paths in a single pool.
    // Processing one pool at a time ensures that a split contains paths
    // from a single pool only.
    for (MultiPathFilter onepool : pools) {
      ArrayList<Path> myPaths = new ArrayList<Path>();

      // pick one input path. If it matches any of the filters in the pool,
      // add it to the output set
      for (Iterator<Path> iter = newpaths.iterator(); iter.hasNext();) {
        Path p = iter.next();
        if (onepool.accept(p)) {
          myPaths.add(p); // add it to my output set
          iter.remove();
        }
      }
      // create splits for all files in this pool.
      getMoreSplits(job, myPaths.toArray(new Path[myPaths.size()]),
                    maxSize, minSizeNode, minSizeRack, splits);
    }

    // create splits for all files that are not in any pool.
    getMoreSplits(job, newpaths.toArray(new Path[newpaths.size()]),
                  maxSize, minSizeNode, minSizeRack, splits);

    // free up rackToNodes map
    rackToNodes.clear();
    return splits;
  }

  /**
   * Return all the splits in the specified set of paths.
   */
  private void getMoreSplits(JobContext job, Path[] paths,
                             long maxSize, long minSizeNode, long minSizeRack,
                             List<InputSplit> splits)
    throws IOException {
    Configuration conf = job.getConfiguration();

    // all blocks for all the files in input set
    OneFileInfo[] files;

    // mapping from a rack name to the list of blocks it has
    HashMap<String, List<OneBlockInfo>> rackToBlocks =
                              new HashMap<String, List<OneBlockInfo>>();

    // mapping from a block to the nodes on which it has replicas
    HashMap<OneBlockInfo, String[]> blockToNodes =
                              new HashMap<OneBlockInfo, String[]>();

    // mapping from a node to the list of blocks that it contains
    HashMap<String, List<OneBlockInfo>> nodeToBlocks =
                              new HashMap<String, List<OneBlockInfo>>();

    files = new OneFileInfo[paths.length];
    if (paths.length == 0) {
      return;
    }

    // populate all the blocks for all files
    long totLength = 0;
    for (int i = 0; i < paths.length; i++) {
      files[i] = new OneFileInfo(paths[i], conf, isSplitable(job, paths[i]),
                                 rackToBlocks, blockToNodes, nodeToBlocks,
                                 rackToNodes, maxSize);
      totLength += files[i].getLength();
    }
    createSplits(nodeToBlocks, blockToNodes, rackToBlocks, totLength,
                 maxSize, minSizeNode, minSizeRack, splits);
  }

  @VisibleForTesting
  void createSplits(HashMap<String, List<OneBlockInfo>> nodeToBlocks,
                     HashMap<OneBlockInfo, String[]> blockToNodes,
                     HashMap<String, List<OneBlockInfo>> rackToBlocks,
                     long totLength,
                     long maxSize,
                     long minSizeNode,
                     long minSizeRack,
                     List<InputSplit> splits
                    ) {
    ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>();
    Set<String> nodes = new HashSet<String>();
    long curSplitSize = 0;

    int numNodes = nodeToBlocks.size();
    long totalLength = totLength;

    while(true) {
      // maxSize is allowed to be 0; in that case load smoothing is disabled
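      // (For example, with totalLength = 1000, maxSize = 100 and 5 nodes,
      // avgSplitsPerNode works out to 2, so each node forms at most 2
      // node-local splits in this pass.)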
      int avgSplitsPerNode = maxSize > 0 && numNodes > 0 ?
                                        ((int) (totalLength/maxSize))/numNodes
                                        : Integer.MAX_VALUE;
      int maxSplitsByNodeOnly = (avgSplitsPerNode > 0) ? avgSplitsPerNode : 1;
      numNodes = 0;

      // process all nodes and create splits that are local to a node.
      for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter = nodeToBlocks
          .entrySet().iterator(); iter.hasNext();) {
        Map.Entry<String, List<OneBlockInfo>> one = iter.next();
        nodes.add(one.getKey());
        List<OneBlockInfo> blocksInNode = one.getValue();

        // for each block, copy it into validBlocks. Delete it from
        // blockToNodes so that the same block does not appear in
        // two different splits.
        int splitsInNode = 0;
        for (OneBlockInfo oneblock : blocksInNode) {
          if (blockToNodes.containsKey(oneblock)) {
            validBlocks.add(oneblock);
            blockToNodes.remove(oneblock);
            curSplitSize += oneblock.length;

            // if the accumulated split size exceeds the maximum, then
            // create this split.
            if (maxSize != 0 && curSplitSize >= maxSize) {
              // create an input split and add it to the splits array
              addCreatedSplit(splits, nodes, validBlocks);
              totalLength -= curSplitSize;
              curSplitSize = 0;
              validBlocks.clear();
              splitsInNode++;
              if (splitsInNode == maxSplitsByNodeOnly) {
                // stop grouping on a node so as not to create
                // disproportionately more splits on a node because it happens
                // to have many blocks
                // consider only these nodes in next round of grouping because
                // they have leftover blocks that may need to be grouped
                numNodes++;
                break;
              }
            }
          }
        }
        // if there were any blocks left over and their combined size is
        // larger than minSizeNode, then combine them into one split.
        // Otherwise add them back to the unprocessed pool. It is likely
        // that they will be combined with other blocks from the
        // same rack later on.
        if (minSizeNode != 0 && curSplitSize >= minSizeNode
            && splitsInNode == 0) {
          // No split has been created on this node yet, so it is OK to add a
          // smaller one for parallelism. Otherwise, group the blocks with the
          // rest of the rack for a more balanced split size.
          // create an input split and add it to the splits array
          addCreatedSplit(splits, nodes, validBlocks);
          totalLength -= curSplitSize;
        } else {
          for (OneBlockInfo oneblock : validBlocks) {
            blockToNodes.put(oneblock, oneblock.hosts);
          }
        }
        validBlocks.clear();
        nodes.clear();
        curSplitSize = 0;
      }

      if (!(numNodes > 0 && totalLength > 0)) {
        break;
      }
    }

    // if blocks in a rack are below the specified minimum size, then keep them
    // in 'overflow'. After the processing of all racks is complete, these
    // overflow blocks will be combined into splits.
    ArrayList<OneBlockInfo> overflowBlocks = new ArrayList<OneBlockInfo>();
    Set<String> racks = new HashSet<String>();

    // Process all racks over and over again until there is no more work to do.
    while (blockToNodes.size() > 0) {

      // Create one split for this rack before moving over to the next rack.
      // Come back to this rack after creating a single split for each of the
      // remaining racks.
      // Process one rack location at a time: combine all possible blocks that
      // reside on this rack into one split (constrained by the minimum and
      // maximum split sizes).

      // iterate over all racks
      for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter =
           rackToBlocks.entrySet().iterator(); iter.hasNext();) {

        Map.Entry<String, List<OneBlockInfo>> one = iter.next();
        racks.add(one.getKey());
        List<OneBlockInfo> blocks = one.getValue();

        // for each block, copy it into validBlocks. Delete it from
        // blockToNodes so that the same block does not appear in
        // two different splits.
        boolean createdSplit = false;
        for (OneBlockInfo oneblock : blocks) {
          if (blockToNodes.containsKey(oneblock)) {
            validBlocks.add(oneblock);
            blockToNodes.remove(oneblock);
            curSplitSize += oneblock.length;

            // if the accumulated split size exceeds the maximum, then
            // create this split.
            if (maxSize != 0 && curSplitSize >= maxSize) {
              // create an input split and add it to the splits array
              addCreatedSplit(splits, getHosts(racks), validBlocks);
              createdSplit = true;
              break;
            }
          }
        }

        // if we created a split, then just go to the next rack
        if (createdSplit) {
          curSplitSize = 0;
          validBlocks.clear();
          racks.clear();
          continue;
        }

        if (!validBlocks.isEmpty()) {
          if (minSizeRack != 0 && curSplitSize >= minSizeRack) {
            // if there is a minimum size specified, then create a single split
            // otherwise, store these blocks into overflow data structure
            addCreatedSplit(splits, getHosts(racks), validBlocks);
          } else {
            // There were a few blocks in this rack that remained to be
            // processed. Keep them in 'overflow' block list.
            // These will be combined later.
            overflowBlocks.addAll(validBlocks);
          }
        }
        curSplitSize = 0;
        validBlocks.clear();
        racks.clear();
      }
    }

    assert blockToNodes.isEmpty();
    assert curSplitSize == 0;
    assert validBlocks.isEmpty();
    assert racks.isEmpty();

    // Process all overflow blocks
    for (OneBlockInfo oneblock : overflowBlocks) {
      validBlocks.add(oneblock);
      curSplitSize += oneblock.length;

      // This might cause an existing rack location to be re-added,
      // but it should be ok.
      for (int i = 0; i < oneblock.racks.length; i++) {
        racks.add(oneblock.racks[i]);
      }

      // if the accumulated split size exceeds the maximum, then
      // create this split.
      if (maxSize != 0 && curSplitSize >= maxSize) {
        // create an input split and add it to the splits array
        addCreatedSplit(splits, getHosts(racks), validBlocks);
        curSplitSize = 0;
        validBlocks.clear();
        racks.clear();
      }
    }

    // Process any remaining blocks.
    if (!validBlocks.isEmpty()) {
      addCreatedSplit(splits, getHosts(racks), validBlocks);
    }
  }

  /**
   * Create a single split from the list of blocks specified in validBlocks.
   * Add this new split into splitList.
   */
  private void addCreatedSplit(List<InputSplit> splitList,
                               Collection<String> locations,
                               ArrayList<OneBlockInfo> validBlocks) {
    // create an input split
    Path[] fl = new Path[validBlocks.size()];
    long[] offset = new long[validBlocks.size()];
    long[] length = new long[validBlocks.size()];
    for (int i = 0; i < validBlocks.size(); i++) {
      fl[i] = validBlocks.get(i).onepath;
      offset[i] = validBlocks.get(i).offset;
      length[i] = validBlocks.get(i).length;
    }
    // add this split to the list that is returned
    CombineFileSplit thissplit = new CombineFileSplit(fl, offset,
                                   length, locations.toArray(new String[0]));
    splitList.add(thissplit);
  }

  /**
   * Subclasses must provide a {@link RecordReader} for a
   * {@link CombineFileSplit}.
   */
  public abstract RecordReader<K, V> createRecordReader(InputSplit split,
      TaskAttemptContext context) throws IOException;

  /**
   * Information about one file from the file system.
   */
  @VisibleForTesting
  static class OneFileInfo {
    private long fileSize;               // size of the file
    private OneBlockInfo[] blocks;       // all blocks in this file

    OneFileInfo(Path path, Configuration conf,
                boolean isSplitable,
                HashMap<String, List<OneBlockInfo>> rackToBlocks,
                HashMap<OneBlockInfo, String[]> blockToNodes,
                HashMap<String, List<OneBlockInfo>> nodeToBlocks,
                HashMap<String, Set<String>> rackToNodes,
                long maxSize)
                throws IOException {
      this.fileSize = 0;

      // get block locations from file system
      FileSystem fs = path.getFileSystem(conf);
      FileStatus stat = fs.getFileStatus(path);
      BlockLocation[] locations = fs.getFileBlockLocations(stat, 0,
                                                           stat.getLen());
      // create a list of all blocks and their locations
      if (locations == null) {
        blocks = new OneBlockInfo[0];
      } else {

        if (locations.length == 0) {
          locations = new BlockLocation[] { new BlockLocation() };
        }

        if (!isSplitable) {
          // if the file is not splitable, just create the one block with
          // full file length
          blocks = new OneBlockInfo[1];
          fileSize = stat.getLen();
          blocks[0] = new OneBlockInfo(path, 0, fileSize, locations[0]
              .getHosts(), locations[0].getTopologyPaths());
        } else {
          ArrayList<OneBlockInfo> blocksList = new ArrayList<OneBlockInfo>(
              locations.length);
          for (int i = 0; i < locations.length; i++) {
            fileSize += locations[i].getLength();

            // each split can be a maximum of maxSize
            long left = locations[i].getLength();
            long myOffset = locations[i].getOffset();
            long myLength = 0;
            do {
              if (maxSize == 0) {
                myLength = left;
              } else {
                if (left > maxSize && left < 2 * maxSize) {
                  // if the remainder is between max and 2*max, then instead
                  // of creating splits of size max and left-max, create two
                  // splits of size left/2 each. This is a heuristic to avoid
                  // creating really small splits.
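                  // For example, with maxSize = 100 and left = 150 this
                  // produces two pieces of 75 rather than one of 100 and
                  // one of 50.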
                  myLength = left / 2;
                } else {
                  myLength = Math.min(maxSize, left);
                }
              }
              OneBlockInfo oneblock = new OneBlockInfo(path, myOffset,
                  myLength, locations[i].getHosts(), locations[i]
                      .getTopologyPaths());
              left -= myLength;
              myOffset += myLength;

              blocksList.add(oneblock);
            } while (left > 0);
          }
          blocks = blocksList.toArray(new OneBlockInfo[blocksList.size()]);
        }

        populateBlockInfo(blocks, rackToBlocks, blockToNodes,
                          nodeToBlocks, rackToNodes);
      }
    }

    @VisibleForTesting
    static void populateBlockInfo(OneBlockInfo[] blocks,
                          HashMap<String, List<OneBlockInfo>> rackToBlocks,
                          HashMap<OneBlockInfo, String[]> blockToNodes,
                          HashMap<String, List<OneBlockInfo>> nodeToBlocks,
                          HashMap<String, Set<String>> rackToNodes) {
      for (OneBlockInfo oneblock : blocks) {
        // add this block to the block --> node locations map
        blockToNodes.put(oneblock, oneblock.hosts);

        // For blocks that do not have host/rack information,
        // assign to the default rack.
        String[] racks = null;
        if (oneblock.hosts.length == 0) {
          racks = new String[]{NetworkTopology.DEFAULT_RACK};
        } else {
          racks = oneblock.racks;
        }

        // add this block to the rack --> block map
        for (int j = 0; j < racks.length; j++) {
          String rack = racks[j];
          List<OneBlockInfo> blklist = rackToBlocks.get(rack);
          if (blklist == null) {
            blklist = new ArrayList<OneBlockInfo>();
            rackToBlocks.put(rack, blklist);
          }
          blklist.add(oneblock);
          if (!racks[j].equals(NetworkTopology.DEFAULT_RACK)) {
            // Add this host to rackToNodes map
            addHostToRack(rackToNodes, racks[j], oneblock.hosts[j]);
          }
        }

        // add this block to the node --> block map
        for (int j = 0; j < oneblock.hosts.length; j++) {
          String node = oneblock.hosts[j];
          List<OneBlockInfo> blklist = nodeToBlocks.get(node);
          if (blklist == null) {
            blklist = new ArrayList<OneBlockInfo>();
            nodeToBlocks.put(node, blklist);
          }
          blklist.add(oneblock);
        }
      }
    }

    long getLength() {
      return fileSize;
    }

    OneBlockInfo[] getBlocks() {
      return blocks;
    }
  }

  /**
   * Information about one block from the file system.
   */
  @VisibleForTesting
  static class OneBlockInfo {
    Path onepath;                // name of this file
    long offset;                 // offset in file
    long length;                 // length of this block
    String[] hosts;              // nodes on which this block resides
    String[] racks;              // network topology of hosts

    OneBlockInfo(Path path, long offset, long len,
                 String[] hosts, String[] topologyPaths) {
      this.onepath = path;
      this.offset = offset;
      this.hosts = hosts;
      this.length = len;
      assert (hosts.length == topologyPaths.length ||
              topologyPaths.length == 0);

      // if the file system does not have any rack information, then
      // use dummy rack location.
      if (topologyPaths.length == 0) {
        topologyPaths = new String[hosts.length];
        for (int i = 0; i < topologyPaths.length; i++) {
          topologyPaths[i] = (new NodeBase(hosts[i],
                              NetworkTopology.DEFAULT_RACK)).toString();
        }
      }

      // The topology paths have the host name included as the last
      // component. Strip it.
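      // e.g. a topology path of "/rack1/host1" yields the rack "/rack1".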
      this.racks = new String[topologyPaths.length];
      for (int i = 0; i < topologyPaths.length; i++) {
        this.racks[i] = (new NodeBase(topologyPaths[i])).getNetworkLocation();
      }
    }
  }

  protected BlockLocation[] getFileBlockLocations(
    FileSystem fs, FileStatus stat) throws IOException {
    return fs.getFileBlockLocations(stat, 0, stat.getLen());
  }

  private static void addHostToRack(HashMap<String, Set<String>> rackToNodes,
                                    String rack, String host) {
    Set<String> hosts = rackToNodes.get(rack);
    if (hosts == null) {
      hosts = new HashSet<String>();
      rackToNodes.put(rack, hosts);
    }
    hosts.add(host);
  }

  private Set<String> getHosts(Set<String> racks) {
    Set<String> hosts = new HashSet<String>();
    for (String rack : racks) {
      if (rackToNodes.containsKey(rack)) {
        hosts.addAll(rackToNodes.get(rack));
      }
    }
    return hosts;
  }

  /**
   * Accept a path only if at least one of the filters given in the
   * constructor accepts it.
   */
  private static class MultiPathFilter implements PathFilter {
    private List<PathFilter> filters;

    public MultiPathFilter() {
      this.filters = new ArrayList<PathFilter>();
    }

    public MultiPathFilter(List<PathFilter> filters) {
      this.filters = filters;
    }

    public void add(PathFilter one) {
      filters.add(one);
    }

    public boolean accept(Path path) {
      for (PathFilter filter : filters) {
        if (filter.accept(path)) {
          return true;
        }
      }
      return false;
    }

    public String toString() {
      StringBuffer buf = new StringBuffer();
      buf.append("[");
      for (PathFilter f: filters) {
        buf.append(f);
        buf.append(",");
      }
      buf.append("]");
      return buf.toString();
    }
  }
}