View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.procedure;
19  
20  import java.io.Closeable;
21  import java.io.IOException;
22  import java.util.Collection;
23  import java.util.concurrent.ConcurrentMap;
24  import java.util.concurrent.ExecutorService;
25  import java.util.concurrent.Future;
26  import java.util.concurrent.RejectedExecutionException;
27  import java.util.concurrent.SynchronousQueue;
28  import java.util.concurrent.ThreadPoolExecutor;
29  import java.util.concurrent.TimeUnit;
30  
31  import org.apache.commons.logging.Log;
32  import org.apache.commons.logging.LogFactory;
33  import org.apache.hadoop.classification.InterfaceAudience;
34  import org.apache.hadoop.classification.InterfaceStability;
35  import org.apache.hadoop.hbase.DaemonThreadFactory;
36  import org.apache.hadoop.hbase.errorhandling.ForeignException;
37  
38  import com.google.common.collect.MapMaker;
39  
40  /**
41   * Process to kick off and manage a running {@link Subprocedure} on a member. This is the
42   * specialized part of a {@link Procedure} that actually does procedure type-specific work
43   * and reports back to the coordinator as it completes each phase.
44   * <p>
45   * If there is a connection error ({@link #controllerConnectionFailure(String, IOException)}), all
46   * currently running subprocedures are notify to failed since there is no longer a way to reach any
47   * other members or coordinators since the rpcs are down.
48   */
49  @InterfaceAudience.Public
50  @InterfaceStability.Evolving
51  public class ProcedureMember implements Closeable {
52    private static final Log LOG = LogFactory.getLog(ProcedureMember.class);
53  
54    private final SubprocedureFactory builder;
55    private final ProcedureMemberRpcs rpcs;
56  
57    private final ConcurrentMap<String,Subprocedure> subprocs =
58        new MapMaker().concurrencyLevel(4).weakValues().makeMap();
59    private final ExecutorService pool;
60  
61    /**
62     * Instantiate a new ProcedureMember.  This is a slave that executes subprocedures.
63     *
64     * @param rpcs controller used to send notifications to the procedure coordinator
65     * @param pool thread pool to submit subprocedures
66     * @param factory class that creates instances of a subprocedure.
67     */
68    public ProcedureMember(ProcedureMemberRpcs rpcs, ThreadPoolExecutor pool,
69        SubprocedureFactory factory) {
70      this.pool = pool;
71      this.rpcs = rpcs;
72      this.builder = factory;
73    }
74  
75    public static ThreadPoolExecutor defaultPool(long wakeFrequency, long keepAlive,
76        int procThreads, String memberName) {
77      return new ThreadPoolExecutor(1, procThreads, keepAlive, TimeUnit.SECONDS,
78          new SynchronousQueue<Runnable>(),
79          new DaemonThreadFactory("member: '" + memberName + "' subprocedure-pool"));
80    }
81  
82    /**
83     * Package exposed.  Not for public use.
84     *
85     * @return reference to the Procedure member's rpcs object
86     */
87    ProcedureMemberRpcs getRpcs() {
88       return rpcs;
89    }
90  
91  
92    /**
93     * This is separated from execution so that we can detect and handle the case where the
94     * subprocedure is invalid and inactionable due to bad info (like DISABLED snapshot type being
95     * sent here)
96     * @param opName
97     * @param data
98     * @return subprocedure
99     */
100   public Subprocedure createSubprocedure(String opName, byte[] data) {
101     return builder.buildSubprocedure(opName, data);
102   }
103 
104   /**
105    * Submit an subprocedure for execution.  This starts the local acquire phase.
106    * @param subproc the subprocedure to execute.
107    * @return <tt>true</tt> if the subprocedure was started correctly, <tt>false</tt> if it
108    *         could not be started. In the latter case, the subprocedure holds a reference to
109    *         the exception that caused the failure.
110    */
111   public boolean submitSubprocedure(Subprocedure subproc) {
112      // if the submitted subprocedure was null, bail.
113     if (subproc == null) {
114       LOG.warn("Submitted null subprocedure, nothing to run here.");
115       return false;
116     }
117 
118     String procName = subproc.getName();
119     if (procName == null || procName.length() == 0) {
120       LOG.error("Subproc name cannot be null or the empty string");
121       return false;
122     }
123 
124     // make sure we aren't already running an subprocedure of that name
125     Subprocedure rsub;
126     synchronized (subprocs) {
127       rsub = subprocs.get(procName);
128     }
129     if (rsub != null) {
130       if (!rsub.isComplete()) {
131         LOG.error("Subproc '" + procName + "' is already running. Bailing out");
132         return false;
133       }
134       LOG.warn("A completed old subproc "  +  procName + " is still present, removing");
135       subprocs.remove(procName);
136     }
137 
138     LOG.debug("Submitting new Subprocedure:" + procName);
139 
140     // kick off the subprocedure
141     Future<Void> future = null;
142     try {
143       future = this.pool.submit(subproc);
144       synchronized (subprocs) {
145         subprocs.put(procName, subproc);
146       }
147       return true;
148     } catch (RejectedExecutionException e) {
149       // the thread pool is full and we can't run the subprocedure
150       String msg = "Subprocedure pool is full!";
151       subproc.cancel(msg, e.getCause());
152 
153       // cancel all subprocedures proactively
154       if (future != null) {
155         future.cancel(true);
156       }
157     }
158 
159     LOG.error("Failed to start subprocedure '" + procName + "'");
160     return false;
161   }
162 
163    /**
164     * Notification that procedure coordinator has reached the global barrier
165     * @param procName name of the subprocedure that should start running the the in-barrier phase
166     */
167    public void receivedReachedGlobalBarrier(String procName) {
168      Subprocedure subproc = subprocs.get(procName);
169      if (subproc == null) {
170        LOG.warn("Unexpected reached glabal barrier message for Sub-Procedure '" + procName + "'");
171        return;
172      }
173      subproc.receiveReachedGlobalBarrier();
174    }
175 
176   /**
177    * Best effort attempt to close the threadpool via Thread.interrupt.
178    */
179   @Override
180   public void close() throws IOException {
181     // have to use shutdown now to break any latch waiting
182     pool.shutdownNow();
183   }
184 
185   /**
186    * Shutdown the threadpool, and wait for upto timeoutMs millis before bailing
187    * @param timeoutMs timeout limit in millis
188    * @return true if successfully, false if bailed due to timeout.
189    * @throws InterruptedException
190    */
191   boolean closeAndWait(long timeoutMs) throws InterruptedException {
192     pool.shutdown();
193     return pool.awaitTermination(timeoutMs, TimeUnit.MILLISECONDS);
194   }
195 
196   /**
197    * The connection to the rest of the procedure group (member and coordinator) has been
198    * broken/lost/failed. This should fail any interested subprocedure, but not attempt to notify
199    * other members since we cannot reach them anymore.
200    * @param message description of the error
201    * @param cause the actual cause of the failure
202    *
203    * TODO i'm tempted to just remove this code completely and treat it like any other abort.
204    * Implementation wise, if this happens it is a ZK failure which means the RS will abort.
205    */
206   public void controllerConnectionFailure(final String message, final IOException cause) {
207     Collection<Subprocedure> toNotify = subprocs.values();
208     LOG.error(message, cause);
209     for (Subprocedure sub : toNotify) {
210       // TODO notify the elements, if they aren't null
211       sub.cancel(message, cause);
212     }
213   }
214 
215   /**
216    * Send abort to the specified procedure
217    * @param procName name of the procedure to about
218    * @param ee exception information about the abort
219    */
220   public void receiveAbortProcedure(String procName, ForeignException ee) {
221     LOG.debug("Request received to abort procedure " + procName, ee);
222     // if we know about the procedure, notify it
223     Subprocedure sub = subprocs.get(procName);
224     if (sub == null) {
225       LOG.info("Received abort on procedure with no local subprocedure " + procName +
226           ", ignoring it.", ee);
227       return; // Procedure has already completed
228     }
229     LOG.error("Propagating foreign exception to subprocedure " + sub.getName(), ee);
230     sub.monitor.receive(ee);
231   }
232 }