View Javadoc

1   package org.apache.jcs.auxiliary.remote;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import org.apache.commons.logging.Log;
23  import org.apache.commons.logging.LogFactory;
24  import org.apache.jcs.engine.CacheConstants;
25  import org.apache.jcs.engine.behavior.ICache;
26  import org.apache.jcs.engine.behavior.ICompositeCacheManager;
27  
28  /***
29   * The RemoteCacheFailoverRunner tries to establish a connection with a failover
30   * server, if any are defined. Once a failover connectin is made, it will
31   * attempt to replace the failover with the primary remote server.
32   * <p>
33   * It works by switching out the RemoteCacheNoWait inside the Facade.
34   * <p>
35   * Client (i.e.) the CompositeCache has refernce to a RemoteCacheNoWaitFacade.
36   * This facade is created by the RemoteCacheFactory. The factory maintains a set
37   * of managers, one for each remote server. Typically, there will only be one
38   * manager.
39   * <p>
40   * If you use multipleremote servesr, you may want to set one or more as
41   * failovers. If a local cache cannot connect to the primary server, or looses
42   * its connection to the primary server, it will attempt to restore that
43   * connectin in the background. If failovers are defined, the Failover runner
44   * will try to connect to a failover until the primary is restored.
45   *
46   */
47  public class RemoteCacheFailoverRunner
48      implements Runnable
49  {
50      private final static Log log = LogFactory.getLog( RemoteCacheFailoverRunner.class );
51  
52      private RemoteCacheNoWaitFacade facade;
53  
54      private static long idlePeriod = 20 * 1000;
55  
56      private boolean alright = true;
57  
58      private ICompositeCacheManager cacheMgr;
59  
60      /***
61       * Constructor for the RemoteCacheFailoverRunner object. This allows the
62       * FailoverRunner to modify the facade that the CompositeCache references.
63       *
64       * @param facade
65       *            the facade the CompositeCache talks to.
66       * @param cacheMgr
67       */
68      public RemoteCacheFailoverRunner( RemoteCacheNoWaitFacade facade, ICompositeCacheManager cacheMgr )
69      {
70          this.facade = facade;
71          this.cacheMgr = cacheMgr;
72      }
73  
74      /***
75       * Notifies the cache monitor that an error occurred, and kicks off the
76       * error recovery process.
77       */
78      public void notifyError()
79      {
80          bad();
81          synchronized ( this )
82          {
83              notify();
84          }
85      }
86  
87      /***
88       * Main processing method for the RemoteCacheFailoverRunner object.
89       * <p>
90       * If we do not have a connection with any failover server, this will try to
91       * connect one at a time. If no connection can be made, it goes to sleep for
92       * a while (20 seconds).
93       * <p>
94       * Once a connection with a failover is made, we will try to reconnect to
95       * the primary server.
96       * <p>
97       * The primary server is the first server defines in the FailoverServers
98       * list.
99       */
100     public void run()
101     {
102         // start the main work of connecting to a failover and then restoring
103         // the primary.
104         connectAndRestore();
105 
106         if ( log.isInfoEnabled() )
107         {
108             log.info( "Exiting failover runner. Failover index = " + facade.remoteCacheAttributes.getFailoverIndex() );
109             if ( facade.remoteCacheAttributes.getFailoverIndex() <= 0 )
110             {
111                 log.info( "Failover index is <= 0, meaning we are not " + "connected to a failover server." );
112             }
113             else if ( facade.remoteCacheAttributes.getFailoverIndex() > 0 )
114             {
115                 log.info( "Failover index is > 0, meaning we are " + "connected to a failover server." );
116             }
117             // log if we are alright or not.
118         }
119         return;
120     }
121 
122     /***
123      * This is the main loop. If there are failovers defined, then this will
124      * continue until the primary is re-connected. If no failovers are defined,
125      * this will exit automatically.
126      */
127     private void connectAndRestore()
128     {
129         do
130         {
131             log.info( "Remote cache FAILOVER RUNNING." );
132 
133             // there is no active listener
134             if ( !alright )
135             {
136                 // Monitor each RemoteCacheManager instance one after the other.
137                 // Each RemoteCacheManager corresponds to one remote connection.
138                 String[] failovers = facade.remoteCacheAttributes.getFailovers();
139                 // we should probalby check to see if there are any failovers,
140                 // even though the caller
141                 // should have already.
142 
143                 if ( failovers == null )
144                 {
145                     log.warn( "Remote is misconfigured, failovers was null." );
146                     return;
147                 }
148                 else if ( failovers.length == 1 )
149                 {
150                     // if there is only the primary, return out of this
151                     if ( log.isInfoEnabled() )
152                     {
153                         log.info( "No failovers defined, exiting failover runner." );
154                         return;
155                     }
156                 }
157 
158                 int fidx = facade.remoteCacheAttributes.getFailoverIndex();
159                 log.debug( "fidx = " + fidx + " failovers.length = " + failovers.length );
160 
161                 // shouldn't we see if the primary is backup?
162                 // If we don't check the primary, if it gets connected in the
163                 // backgorund,
164                 // we will disconnect it only to put it right back
165                 int i = fidx; // + 1; // +1 skips the primary
166                 if ( log.isDebugEnabled() )
167                 {
168                     log.debug( "stating at failover i = " + i );
169                 }
170 
171                 // try them one at a time until successful
172                 for ( ; i < failovers.length && !alright; i++ )
173                 {
174                     String server = failovers[i];
175                     if ( log.isDebugEnabled() )
176                     {
177                         log.debug( "Trying server [" + server + "] at failover index i = " + i );
178                     }
179 
180                     RemoteCacheAttributes rca = null;
181                     try
182                     {
183                         rca = (RemoteCacheAttributes) facade.remoteCacheAttributes.copy();
184                         rca.setRemoteHost( server.substring( 0, server.indexOf( ":" ) ) );
185                         rca.setRemotePort( Integer.parseInt( server.substring( server.indexOf( ":" ) + 1 ) ) );
186                         RemoteCacheManager rcm = RemoteCacheManager.getInstance( rca, cacheMgr );
187 
188                         if ( log.isDebugEnabled() )
189                         {
190                             log.debug( "RemoteCacheAttributes for failover = " + rca.toString() );
191                         }
192 
193                         // add a listener if there are none, need to tell rca
194                         // what number it is at
195                         ICache ic = rcm.getCache( rca.getCacheName() );
196                         if ( ic != null )
197                         {
198                             if ( ic.getStatus() == CacheConstants.STATUS_ALIVE )
199                             {
200                                 // may need to do this more gracefully
201                                 log.debug( "reseting no wait" );
202                                 facade.noWaits = new RemoteCacheNoWait[1];
203                                 facade.noWaits[0] = (RemoteCacheNoWait) ic;
204                                 facade.remoteCacheAttributes.setFailoverIndex( i );
205 
206                                 synchronized ( this )
207                                 {
208                                     if ( log.isDebugEnabled() )
209                                     {
210                                         log.debug( "setting ALRIGHT to true" );
211                                         if ( i > 0 )
212                                         {
213                                             log.debug( "Moving to Primary Recovery Mode, failover index = " + i );
214                                         }
215                                         else
216                                         {
217                                             if ( log.isInfoEnabled() )
218                                             {
219                                                 String message = "No need to connect to failover, the primary server is back up.";
220                                                 log.info( message );
221                                             }
222                                         }
223                                     }
224 
225                                     alright = true;
226 
227                                     if ( log.isInfoEnabled() )
228                                     {
229                                         log.info( "CONNECTED to host = [" + rca.getRemoteHost() + "] port = ["
230                                             + rca.getRemotePort() + "]" );
231                                     }
232                                 }
233                             }
234                         }
235                         else
236                         {
237                             log.info( "noWait is null" );
238                         }
239                     }
240                     catch ( Exception ex )
241                     {
242                         bad();
243                         // Problem encountered in fixing the caches managed by a
244                         // RemoteCacheManager instance.
245                         // Soldier on to the next RemoteCacheManager instance.
246                         if ( i == 0 )
247                         {
248                             log.warn( "FAILED to connect, as expected, to primary" + rca.getRemoteHost() + ":"
249                                 + rca.getRemotePort(), ex );
250                         }
251                         else
252                         {
253                             log.error( "FAILED to connect to failover [" + rca.getRemoteHost() + ":"
254                                 + rca.getRemotePort() + "]", ex );
255                         }
256                     }
257                 }
258             }
259             // end if !alright
260             // get here if while index >0 and alright, meaning that we are
261             // connected to some backup server.
262             else
263             {
264                 if ( log.isDebugEnabled() )
265                 {
266                     log.debug( "ALRIGHT is true " );
267                 }
268                 if ( log.isInfoEnabled() )
269                 {
270                     log.info( "Failover runner is in primary recovery mode. Failover index = "
271                         + facade.remoteCacheAttributes.getFailoverIndex() + "\n" + "Will now try to reconnect to primary server." );
272                 }
273             }
274 
275             boolean primaryRestoredSuccessfully = false;
276             // if we are not connected to the primary, try.
277             if ( facade.remoteCacheAttributes.getFailoverIndex() > 0 )
278             {
279                 primaryRestoredSuccessfully = restorePrimary();
280                 if ( log.isDebugEnabled() )
281                 {
282                     log.debug( "Primary recovery success state = " + primaryRestoredSuccessfully );
283                 }
284             }
285 
286             if ( !primaryRestoredSuccessfully )
287             {
288                 // Time driven mode: sleep between each round of recovery
289                 // attempt.
290                 try
291                 {
292                     log.warn( "Failed to reconnect to primary server. Cache failover runner is going to sleep for "
293                         + idlePeriod + " milliseconds." );
294                     Thread.sleep( idlePeriod );
295                 }
296                 catch ( InterruptedException ex )
297                 {
298                     // ignore;
299                 }
300             }
301 
302             // try to bring the listener back to the primary
303         }
304         while ( facade.remoteCacheAttributes.getFailoverIndex() > 0 || !alright );
305         // continue if the primary is not restored or if things are not alright.
306 
307     }
308 
309     /***
310      * Try to restore the primary server.
311      * <p>
312      * Once primary is restored the failover listener must be deregistered.
313      * <p>
314      * The primary server is the first server defines in the FailoverServers
315      * list.
316      *
317      * @return boolean value indicating whether the resoration was successful
318      */
319     private boolean restorePrimary()
320     {
321         // try to move back to the primary
322         String[] failovers = facade.remoteCacheAttributes.getFailovers();
323         String server = failovers[0];
324 
325         if ( log.isInfoEnabled() )
326         {
327             log.info( "Trying to restore connection to primary remote server [" + server + "]" );
328         }
329 
330         try
331         {
332             RemoteCacheAttributes rca = (RemoteCacheAttributes) facade.remoteCacheAttributes.copy();
333             rca.setRemoteHost( server.substring( 0, server.indexOf( ":" ) ) );
334             rca.setRemotePort( Integer.parseInt( server.substring( server.indexOf( ":" ) + 1 ) ) );
335             RemoteCacheManager rcm = RemoteCacheManager.getInstance( rca, cacheMgr );
336 
337             // add a listener if there are none, need to tell rca what number it
338             // is at
339             ICache ic = rcm.getCache( rca.getCacheName() );
340             // by default the listener id should be 0, else it will be the
341             // listener
342             // orignally associated with the remote cache. either way is fine.
343             // We just don't want the listener id from a failover being used.
344             // If the remote server was rebooted this couldbe a problem if new
345             // locals were also added.
346 
347             if ( ic != null )
348             {
349                 if ( ic.getStatus() == CacheConstants.STATUS_ALIVE )
350                 {
351                     try
352                     {
353                         // we could have more than one listener registered right
354                         // now.
355                         // this will not result in a loop, only duplication
356                         // stop duplicate listening.
357                         if ( facade.noWaits[0] != null && facade.noWaits[0].getStatus() == CacheConstants.STATUS_ALIVE )
358                         {
359                             int fidx = facade.remoteCacheAttributes.getFailoverIndex();
360 
361                             if ( fidx > 0 )
362                             {
363                                 String serverOld = failovers[fidx];
364 
365                                 if ( log.isDebugEnabled() )
366                                 {
367                                     log.debug( "Failover Index = " + fidx + " the server at that index is ["
368                                         + serverOld + "]" );
369                                 }
370 
371                                 if ( serverOld != null )
372                                 {
373                                     // create attributes that reflect the
374                                     // previous failed over configuration.
375                                     RemoteCacheAttributes rcaOld = (RemoteCacheAttributes) facade.remoteCacheAttributes.copy();
376                                     rcaOld.setRemoteHost( serverOld.substring( 0, serverOld.indexOf( ":" ) ) );
377                                     rcaOld.setRemotePort( Integer.parseInt( serverOld.substring( serverOld
378                                         .indexOf( ":" ) + 1 ) ) );
379                                     RemoteCacheManager rcmOld = RemoteCacheManager.getInstance( rcaOld, cacheMgr );
380 
381                                     if ( rcmOld != null )
382                                     {
383                                         // manager can remove by name if
384                                         // necessary
385                                         rcmOld.removeRemoteCacheListener( rcaOld );
386                                     }
387                                     if ( log.isInfoEnabled() )
388                                     {
389                                         log.info( "Successfully deregistered from FAILOVER remote server = "
390                                             + serverOld );
391                                     }
392                                 }
393                             }
394                             else if ( fidx == 0 )
395                             {
396                                 // this should never happen. If there are no
397                                 // failovers this shouldn't get called.
398                                 if ( log.isDebugEnabled() )
399                                 {
400                                     log.debug( "No need to restore primary, it is already restored." );
401                                     return true;
402                                 }
403                             }
404                             else if ( fidx < 0 )
405                             {
406                                 // this should never happen
407                                 log.warn( "Failover index is less than 0, this shouldn't happen" );
408                             }
409                         }
410                     }
411                     catch ( Exception e )
412                     {
413                         // TODO, should try again, or somehow stop the listener
414                         log.error(
415                                    "Trouble trying to deregister old failover listener prior to restoring the primary = "
416                                        + server, e );
417                     }
418 
419                     // Restore primary
420                     // may need to do this more gracefully, letting the failover finish in the background
421                     RemoteCacheNoWait failoverNoWait = facade.noWaits[0];
422 
423                     // swap in a new one
424                     facade.noWaits = new RemoteCacheNoWait[1];
425                     facade.noWaits[0] = (RemoteCacheNoWait) ic;
426                     facade.remoteCacheAttributes.setFailoverIndex( 0 );
427 
428                     if ( log.isInfoEnabled() )
429                     {
430                         log.info( "Successfully reconnected to PRIMARY remote server.  Substituted primary for failoverNoWait [" + failoverNoWait + "]" );
431                     }
432                     return true;
433                 }
434 
435                 // else alright
436                 // if the failover index was at 0 here, we would be in a bad
437                 // situation, unless there were jsut
438                 // no failovers configured.
439                 if ( log.isDebugEnabled() )
440                 {
441                     log.debug( "Primary server status in error, not connected." );
442                 }
443             }
444             else
445             {
446                 if ( log.isDebugEnabled() )
447                 {
448                     log.debug( "Primary server is null, not connected." );
449                 }
450             }
451         }
452         catch ( Exception ex )
453         {
454             log.error( ex );
455         }
456         return false;
457     }
458 
459     /***
460      * Sets the "alright" flag to false in a critial section. This flag
461      * indicates whether or not we are connected to any server at all. If we are
462      * connected to a secondary server, then alright will be true, but we will
463      * continue to try to restore the connetion with the primary server.
464      * <p>
465      * The primary server is the first server defines in the FailoverServers
466      * list.
467      */
468     private void bad()
469     {
470         if ( alright )
471         {
472             synchronized ( this )
473             {
474                 alright = false;
475             }
476         }
477     }
478 }