I'm working with a 10-node Infinispan cluster used as a Hibernate Search backend. Our servers are running TC server 2.5 (tomcat 6.0.32) on Java 1.6_24. We are using jGroups 2.12.1.3 for handling cluster cache writes from each node, and for multicast UDP transport.
When we launch 3+ nodes in our cluster, eventually one of the nodes begins to log replication timeouts. We've observed the same result whether we configure Infinispan for replication or for distribution cache modes. Although the rest of the cluster remains stable, the failing node becomes essentially unsuable for search.
Our configuration:
Infinispan:
Code:
<?xml version="1.0" encoding="UTF-8"?>
<infinispan
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="urn:infinispan:config:5.0 http://www.infinispan.org/schemas/infinispan-config-5.0.xsd"
xmlns="urn:infinispan:config:5.0">
<global>
<globalJmxStatistics
enabled="true"
cacheManagerName="HibernateSearch"
allowDuplicateDomains="true" />
<transport
clusterName="HibernateSearch-Infinispan-cluster-MT"
distributedSyncTimeout="50000">
<properties>
<property name="configurationFile" value="infinispan-udp.cfg.xml" />
</properties>
</transport>
<shutdown
hookBehavior="DONT_REGISTER" />
</global>
<default>
<locking
lockAcquisitionTimeout="20000"
writeSkewCheck="false"
concurrencyLevel="5000"
useLockStriping="false" />
<storeAsBinary storeKeysAsBinary="false" storeValuesAsBinary="true"
enabled="false" />
<invocationBatching
enabled="true" />
<clustering
mode="replication">
<stateRetrieval
timeout="60000"
logFlushTimeout="65000"
fetchInMemoryState="true"
alwaysProvideInMemoryState="true" />
<sync
replTimeout="50000" />
<l1 enabled="false" />
</clustering>
<jmxStatistics
enabled="true" />
<eviction
maxEntries="-1"
strategy="NONE" />
<expiration
maxIdle="-1" />
</default>
<namedCache
name="LuceneIndexesMetadata">
<clustering
mode="replication">
<stateRetrieval
fetchInMemoryState="true"
logFlushTimeout="30000" />
<sync
replTimeout="50000" />
<l1 enabled="false" />
</clustering>
<locking
lockAcquisitionTimeout="20000"
writeSkewCheck="false"
concurrencyLevel="5000"
useLockStriping="false" />
<loaders shared="true" preload="true">
<loader class="org.infinispan.loaders.jdbm.JdbmCacheStore" fetchPersistentState="false" ignoreModifications="false" purgeOnStartup="false">
<properties>
<property name="location" value="/usr/local/tc/.index/metadata" />
</properties>
</loader>
</loaders>
</namedCache>
<namedCache
name="LuceneIndexesData">
<clustering
mode="replication">
<stateRetrieval
fetchInMemoryState="true"
logFlushTimeout="30000" />
<sync
replTimeout="50000" />
<l1 enabled="false" />
</clustering>
<locking
lockAcquisitionTimeout="20000"
writeSkewCheck="false"
concurrencyLevel="5000"
useLockStriping="false" />
<loaders shared="true" preload="true">
<loader class="org.infinispan.loaders.jdbm.JdbmCacheStore" fetchPersistentState="false" ignoreModifications="false" purgeOnStartup="false">
<properties>
<property name="location" value="/usr/local/tc/.index/data" />
</properties>
</loader>
</loaders>
</namedCache>
<namedCache
name="LuceneIndexesLocking">
<clustering
mode="replication">
<stateRetrieval
fetchInMemoryState="true"
logFlushTimeout="30000" />
<sync
replTimeout="50000" />
<l1 enabled="false" />
</clustering>
<locking
lockAcquisitionTimeout="20000"
writeSkewCheck="false"
concurrencyLevel="5000"
useLockStriping="false" />
</namedCache>
</infinispan>
jGroups (UDP):
Code:
<config xmlns="urn:org:jgroups"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="urn:org:jgroups http://www.jgroups.org/schema/JGroups-2.12.xsd">
<UDP
mcast_addr="${jgroups.udp.mcast_addr:228.10.10.9}"
mcast_port="${jgroups.udp.mcast_port:45599}"
tos="8"
ucast_recv_buf_size="20000000"
ucast_send_buf_size="640000"
mcast_recv_buf_size="25000000"
mcast_send_buf_size="640000"
loopback="true"
discard_incompatible_packets="true"
max_bundle_size="64000"
max_bundle_timeout="30"
ip_ttl="${jgroups.udp.ip_ttl:2}"
enable_bundling="true"
enable_diagnostics="false"
thread_naming_pattern="pl"
thread_pool.enabled="true"
thread_pool.min_threads="2"
thread_pool.max_threads="30"
thread_pool.keep_alive_time="5000"
thread_pool.queue_enabled="false"
thread_pool.queue_max_size="100"
thread_pool.rejection_policy="Discard"
oob_thread_pool.enabled="true"
oob_thread_pool.min_threads="2"
oob_thread_pool.max_threads="30"
oob_thread_pool.keep_alive_time="5000"
oob_thread_pool.queue_enabled="false"
oob_thread_pool.queue_max_size="100"
oob_thread_pool.rejection_policy="Discard"
/>
<PING timeout="3000" num_initial_members="10"/>
<MERGE2 max_interval="30000" min_interval="10000"/>
<FD_SOCK/>
<FD/>
<BARRIER />
<pbcast.NAKACK use_stats_for_retransmission="false"
exponential_backoff="0"
use_mcast_xmit="true" gc_lag="0"
retransmit_timeout="300,600,1200"
discard_delivered_msgs="true"/>
<UNICAST timeout="300,600,1200"/>
<pbcast.STABLE stability_delay="1000" desired_avg_gossip="50000" max_bytes="1000000"/>
<pbcast.GMS print_local_addr="true" join_timeout="3000" view_bundling="true"/>
<UFC max_credits="500000" min_threshold="0.20"/>
<MFC max_credits="500000" min_threshold="0.20"/>
<FRAG2 frag_size="60000" />
<pbcast.STREAMING_STATE_TRANSFER/>
</config>
And the errors we observe:
Code:
10-31-2011 13:53:02 ERROR Hibernate Search: Directory writer-3 interceptors.InvocationContextInterceptor: ISPN000136: Execution error
org.infinispan.util.concurrent.TimeoutException: Replication timeout for tc-cluster-0105-21082
at org.infinispan.remoting.transport.AbstractTransport.parseResponseAndAddToResponseList(AbstractTransport.java:71)
at org.infinispan.remoting.transport.jgroups.JGroupsTransport.invokeRemotely(JGroupsTransport.java:452)
at org.infinispan.remoting.rpc.RpcManagerImpl.invokeRemotely(RpcManagerImpl.java:132)
at org.infinispan.remoting.rpc.RpcManagerImpl.invokeRemotely(RpcManagerImpl.java:156)
at org.infinispan.remoting.rpc.RpcManagerImpl.invokeRemotely(RpcManagerImpl.java:265)
at org.infinispan.remoting.rpc.RpcManagerImpl.invokeRemotely(RpcManagerImpl.java:252)
at org.infinispan.remoting.rpc.RpcManagerImpl.broadcastRpcCommand(RpcManagerImpl.java:235)
at org.infinispan.remoting.rpc.RpcManagerImpl.broadcastRpcCommand(RpcManagerImpl.java:228)
at org.infinispan.interceptors.ReplicationInterceptor.handleCrudMethod(ReplicationInterceptor.java:116)
at org.infinispan.interceptors.ReplicationInterceptor.visitPutKeyValueCommand(ReplicationInterceptor.java:79)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)
at org.infinispan.interceptors.LockingInterceptor.visitPutKeyValueCommand(LockingInterceptor.java:294)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)
at org.infinispan.interceptors.base.CommandInterceptor.handleDefault(CommandInterceptor.java:133)
at org.infinispan.commands.AbstractVisitor.visitPutKeyValueCommand(AbstractVisitor.java:60)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)
at org.infinispan.interceptors.TxInterceptor.enlistWriteAndInvokeNext(TxInterceptor.java:214)
at org.infinispan.interceptors.TxInterceptor.visitPutKeyValueCommand(TxInterceptor.java:162)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)
at org.infinispan.interceptors.CacheMgmtInterceptor.visitPutKeyValueCommand(CacheMgmtInterceptor.java:114)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)
at org.infinispan.interceptors.InvocationContextInterceptor.handleAll(InvocationContextInterceptor.java:104)
at org.infinispan.interceptors.InvocationContextInterceptor.handleDefault(InvocationContextInterceptor.java:64)
at org.infinispan.commands.AbstractVisitor.visitPutKeyValueCommand(AbstractVisitor.java:60)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.base.CommandInterceptor.invokeNextInterceptor(CommandInterceptor.java:119)
at org.infinispan.interceptors.BatchingInterceptor.handleDefault(BatchingInterceptor.java:77)
at org.infinispan.commands.AbstractVisitor.visitPutKeyValueCommand(AbstractVisitor.java:60)
at org.infinispan.commands.write.PutKeyValueCommand.acceptVisitor(PutKeyValueCommand.java:77)
at org.infinispan.interceptors.InterceptorChain.invoke(InterceptorChain.java:274)
at org.infinispan.CacheImpl.putIfAbsent(CacheImpl.java:524)
at org.infinispan.CacheSupport.putIfAbsent(CacheSupport.java:74)
at org.infinispan.lucene.locking.BaseLuceneLock.obtain(BaseLuceneLock.java:65)
at org.apache.lucene.store.Lock.obtain(Lock.java:72)
at org.apache.lucene.index.IndexWriter.<init>(IndexWriter.java:1097)
at org.hibernate.search.backend.Workspace.createNewIndexWriter(Workspace.java:202)
at org.hibernate.search.backend.Workspace.getIndexWriter(Workspace.java:180)
at org.hibernate.search.backend.impl.lucene.PerDPQueueProcessor.run(PerDPQueueProcessor.java:103)
at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:886)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:908)
at java.lang.Thread.run(Thread.java:662)
Because this error is so pervasive regardless of our topology or caching mode, we believe we must be misconfigured somewhere. Can anyone recommend a fix?