File: test_backpressure.py

package info (click to toggle)
python-cassandra-driver 3.29.2-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,144 kB
  • sloc: python: 51,532; ansic: 768; makefile: 136; sh: 13
file content (179 lines) | stat: -rw-r--r-- 7,422 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# Copyright DataStax, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time

from cassandra import OperationTimedOut
from cassandra.cluster import Cluster, ExecutionProfile, EXEC_PROFILE_DEFAULT, NoHostAvailable
from cassandra.policies import RoundRobinPolicy, WhiteListRoundRobinPolicy
from tests.integration import requiressimulacron, libevtest
from tests.integration.simulacron import SimulacronBase, PROTOCOL_VERSION
from tests.integration.simulacron.utils import ResumeReads, PauseReads, prime_request, start_and_prime_singledc


@requiressimulacron
@libevtest
class TCPBackpressureTests(SimulacronBase):
    def setUp(self):
        self.callback_successes = 0
        self.callback_errors = 0

    def callback_success(self, results):
        self.callback_successes += 1

    def callback_error(self, results):
        self.callback_errors += 1

    def _fill_buffers(self, session, query, expected_blocked=3, **execute_kwargs):
        futures = []
        buffer = '1' * 50000
        for _ in range(100000):
            future = session.execute_async(query, [buffer], **execute_kwargs)
            futures.append(future)

            total_blocked = 0
            for pool in session.get_pools():
                if not pool._connection._socket_writable:
                    total_blocked += 1
            if total_blocked >= expected_blocked:
                break
        else:
            raise Exception("Unable to fill TCP send buffer on expected number of nodes")
        return futures

    def test_paused_connections(self):
        """  Verify all requests come back as expected if node resumes within query timeout """
        start_and_prime_singledc()
        profile = ExecutionProfile(request_timeout=500, load_balancing_policy=RoundRobinPolicy())
        cluster = Cluster(
            protocol_version=PROTOCOL_VERSION,
            compression=False,
            execution_profiles={EXEC_PROFILE_DEFAULT: profile},
        )
        session = cluster.connect(wait_for_all_pools=True)
        self.addCleanup(cluster.shutdown)

        query = session.prepare("INSERT INTO table1 (id) VALUES (?)")

        prime_request(PauseReads())
        futures = self._fill_buffers(session, query)

        # Make sure we actually have some stuck in-flight requests
        for in_flight in [pool._connection.in_flight for pool in session.get_pools()]:
            self.assertGreater(in_flight, 100)
        time.sleep(.5)
        for in_flight in [pool._connection.in_flight for pool in session.get_pools()]:
            self.assertGreater(in_flight, 100)

        prime_request(ResumeReads())

        for future in futures:
            try:
                future.result()
            except NoHostAvailable as e:
                # We shouldn't have any timeouts here, but all of the queries beyond what can fit
                # in the tcp buffer will have returned with a ConnectionBusy exception
                self.assertIn("ConnectionBusy", str(e))

        # Verify that we can continue sending queries without any problems
        for host in session.cluster.metadata.all_hosts():
            session.execute(query, ["a"], host=host)

    def test_queued_requests_timeout(self):
        """ Verify that queued requests timeout as expected """
        start_and_prime_singledc()
        profile = ExecutionProfile(request_timeout=.1, load_balancing_policy=RoundRobinPolicy())
        cluster = Cluster(
            protocol_version=PROTOCOL_VERSION,
            compression=False,
            execution_profiles={EXEC_PROFILE_DEFAULT: profile},
        )
        session = cluster.connect(wait_for_all_pools=True)
        self.addCleanup(cluster.shutdown)

        query = session.prepare("INSERT INTO table1 (id) VALUES (?)")

        prime_request(PauseReads())

        futures = []
        for i in range(1000):
            future = session.execute_async(query, [str(i)])
            future.add_callbacks(callback=self.callback_success, errback=self.callback_error)
            futures.append(future)

        successes = 0
        for future in futures:
            try:
                future.result()
                successes += 1
            except OperationTimedOut:
                pass

        # Simulacron will respond to a couple queries before cutting off reads, so we'll just verify
        # that only "a few" successes happened here
        self.assertLess(successes, 50)
        self.assertLess(self.callback_successes, 50)
        self.assertEqual(self.callback_errors, len(futures) - self.callback_successes)

    def test_cluster_busy(self):
        """ Verify that once TCP buffer is full we get busy exceptions rather than timeouts """
        start_and_prime_singledc()
        profile = ExecutionProfile(load_balancing_policy=RoundRobinPolicy())
        cluster = Cluster(
            protocol_version=PROTOCOL_VERSION,
            compression=False,
            execution_profiles={EXEC_PROFILE_DEFAULT: profile},
        )
        session = cluster.connect(wait_for_all_pools=True)
        self.addCleanup(cluster.shutdown)

        query = session.prepare("INSERT INTO table1 (id) VALUES (?)")

        prime_request(PauseReads())

        # These requests will get stuck in the TCP buffer and we have no choice but to let them time out
        self._fill_buffers(session, query, expected_blocked=3)

        # Now that our send buffer is completely full, verify we immediately get busy exceptions rather than timing out
        for i in range(1000):
            with self.assertRaises(NoHostAvailable) as e:
                session.execute(query, [str(i)])
            self.assertIn("ConnectionBusy", str(e.exception))

    def test_node_busy(self):
        """ Verify that once TCP buffer is full, queries continue to get re-routed to other nodes """
        start_and_prime_singledc()
        profile = ExecutionProfile(load_balancing_policy=RoundRobinPolicy())
        cluster = Cluster(
            protocol_version=PROTOCOL_VERSION,
            compression=False,
            execution_profiles={EXEC_PROFILE_DEFAULT: profile},
        )
        session = cluster.connect(wait_for_all_pools=True)
        self.addCleanup(cluster.shutdown)

        query = session.prepare("INSERT INTO table1 (id) VALUES (?)")

        prime_request(PauseReads(dc_id=0, node_id=0))

        blocked_profile = ExecutionProfile(load_balancing_policy=WhiteListRoundRobinPolicy(["127.0.0.1"]))
        cluster.add_execution_profile('blocked_profile', blocked_profile)

        # Fill our blocked node's tcp buffer until we get a busy exception
        self._fill_buffers(session, query, expected_blocked=1, execution_profile='blocked_profile')

        # Now that our send buffer is completely full on one node,
        # verify queries get re-routed to other nodes and queries complete successfully
        for i in range(1000):
            session.execute(query, [str(i)])