Skip to content

Commit bfc04fc

Browse files
authored
SOLR-18176: HttpShardHandler query throughput bottleneck from ZooKeeper (#4237)
CloudReplicaSource was making a clusterstate call to ZooKeeper for every distributed request if you search over multiple collections, and when the coordinator has no local replica for some of them. This is because the get call was bypassing state cache. This created a severe bottleneck in query throughput so small fix made to just enable cached state lookups.
1 parent bf61053 commit bfc04fc

File tree

3 files changed

+77
-7
lines changed

3 files changed

+77
-7
lines changed
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
title: Increased query throughput by removing a call to ZooKeeper for cluster state that should have been cached. Happens when Solr does distributed search over multiple collections, and when the coordinator has no local replica for some of them.
2+
type: changed
3+
authors:
4+
- name: Matthew Biscocho
5+
links:
6+
- name: SOLR-18176
7+
url: https://issues.apache.org/jira/browse/SOLR-18176
8+
- name: SOLR-15352
9+
url: https://issues.apache.org/jira/browse/SOLR-15352

solr/core/src/java/org/apache/solr/handler/component/CloudReplicaSource.java

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -109,12 +109,13 @@ private void withShardsParam(Builder builder, String shardsParam) {
109109
if (sliceOrUrl.indexOf('/') < 0) {
110110
// this is a logical shard
111111
this.slices[i] = sliceOrUrl;
112-
replicas[i] =
113-
findReplicas(
114-
builder,
115-
shardsParam,
116-
clusterState,
117-
clusterState.getCollection(builder.collection).getSlice(sliceOrUrl));
112+
DocCollection coll = clusterState.getCollectionOrNull(builder.collection, true);
113+
if (coll == null) {
114+
throw new SolrException(
115+
SolrException.ErrorCode.BAD_REQUEST,
116+
"Could not find collection to resolve replicas: " + builder.collection);
117+
}
118+
replicas[i] = findReplicas(builder, shardsParam, clusterState, coll.getSlice(sliceOrUrl));
118119
} else {
119120
// this has urls
120121
this.replicas[i] = StrUtils.splitSmart(sliceOrUrl, "|", true);
@@ -189,7 +190,12 @@ private void addSlices(
189190
String collectionName,
190191
String shardKeys,
191192
boolean multiCollection) {
192-
DocCollection coll = state.getCollection(collectionName);
193+
DocCollection coll = state.getCollectionOrNull(collectionName, true);
194+
if (coll == null) {
195+
throw new SolrException(
196+
SolrException.ErrorCode.BAD_REQUEST,
197+
"Could not find collection to add slices: " + collectionName);
198+
}
193199
Collection<Slice> slices = coll.getRouter().getSearchSlices(shardKeys, params, coll);
194200
ClientUtils.addSlices(target, collectionName, slices, multiCollection);
195201
}

solr/core/src/test/org/apache/solr/handler/component/DistributedQueryComponentOptimizationTest.java

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,19 @@
2424
import java.util.Set;
2525
import java.util.concurrent.TimeUnit;
2626
import org.apache.solr.BaseDistributedSearchTestCase;
27+
import org.apache.solr.client.solrj.SolrClient;
28+
import org.apache.solr.client.solrj.jetty.HttpJettySolrClient;
2729
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
2830
import org.apache.solr.client.solrj.request.SolrQuery;
2931
import org.apache.solr.client.solrj.request.UpdateRequest;
3032
import org.apache.solr.client.solrj.response.QueryResponse;
3133
import org.apache.solr.cloud.SolrCloudTestCase;
34+
import org.apache.solr.common.cloud.SolrZKMetricsListener;
3235
import org.apache.solr.common.params.CommonParams;
3336
import org.apache.solr.common.params.ShardParams;
3437
import org.apache.solr.common.util.SimpleOrderedMap;
3538
import org.apache.solr.common.util.StrUtils;
39+
import org.apache.solr.embedded.JettySolrRunner;
3640
import org.junit.BeforeClass;
3741
import org.junit.Test;
3842

@@ -707,6 +711,57 @@ private QueryResponse queryWithAsserts(String... q) throws Exception {
707711
return response;
708712
}
709713

714+
/**
715+
* When a node resolves collection state for a collection it doesn't host, queries should use
716+
* cached state and not make ZK calls on every query.
717+
*/
718+
@Test
719+
public void testDistributedQueryDoesNotReadFromZk() throws Exception {
720+
final String secondColl = "secondColl";
721+
722+
// Create a collection on only 1 node so the other node uses LazyCollectionRef for state
723+
List<JettySolrRunner> jettys = cluster.getJettySolrRunners();
724+
CollectionAdminRequest.createCollection(secondColl, "conf", 1, 1)
725+
.setCreateNodeSet(jettys.get(0).getNodeName())
726+
.processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT);
727+
cluster
728+
.getZkStateReader()
729+
.waitForState(
730+
secondColl,
731+
DEFAULT_TIMEOUT,
732+
TimeUnit.SECONDS,
733+
(n, c) -> SolrCloudTestCase.replicasForCollectionAreFullyActive(n, c, 1, 1));
734+
735+
try {
736+
// Node 1 hosts COLLECTION but not secondColl.
737+
// Send a multi-collection query to trigger LazyCollectionRef get call
738+
JettySolrRunner nodeWithoutSecondColl = jettys.get(1);
739+
try (SolrClient client =
740+
new HttpJettySolrClient.Builder(nodeWithoutSecondColl.getBaseUrl().toString()).build()) {
741+
742+
String collectionsParameter = COLLECTION + "," + secondColl;
743+
744+
// Warm up LazyCollectionRef state cache with query
745+
client.query(COLLECTION, new SolrQuery("q", "*:*", "collection", collectionsParameter));
746+
747+
// Get ZK metrics from the coordinator node (the one we're querying)
748+
SolrZKMetricsListener metrics =
749+
nodeWithoutSecondColl.getCoreContainer().getZkController().getZkClient().getMetrics();
750+
long existsBefore = metrics.getExistsChecks();
751+
752+
// Query again and assert that exists call is not made
753+
client.query(COLLECTION, new SolrQuery("q", "*:*", "collection", collectionsParameter));
754+
assertEquals(
755+
"Query should not cause ZK exists checks as collection state should be cached",
756+
existsBefore,
757+
metrics.getExistsChecks());
758+
}
759+
} finally {
760+
CollectionAdminRequest.deleteCollection(secondColl)
761+
.processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT);
762+
}
763+
}
764+
710765
private int getNumRequests(
711766
Map<String, List<TrackingShardHandlerFactory.ShardRequestAndParams>> requests) {
712767
int beforeNumRequests = 0;

0 commit comments

Comments
 (0)