Hello,
We’re running into a bit of a strange issue with the Prefect Worker running within a self-hosted Kubernetes cluster, where after running and spawning Kubernetes Jobs without any issues for several hours, we start getting the following error message “Flow run could not be submitted to infrastructure”.
Looking at the logs for the worker/flow, we see the following trace being reported.
Failed to submit flow run '2cb29929-2684-43e5-a51b-62a725d0f512' to infrastructure.
Traceback (most recent call last):
File "/usr/local/lib/python3.11/site-packages/urllib3/connection.py", line 174, in _new_conn
conn = connection.create_connection(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/urllib3/util/connection.py", line 95, in create_connection
raise err
File "/usr/local/lib/python3.11/site-packages/urllib3/util/connection.py", line 85, in create_connection
sock.connect(sa)
OSError: [Errno 99] Cannot assign requested address
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.11/site-packages/urllib3/connectionpool.py", line 714, in urlopen
httplib_response = self._make_request(
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/urllib3/connectionpool.py", line 403, in _make_request
self._validate_conn(conn)
File "/usr/local/lib/python3.11/site-packages/urllib3/connectionpool.py", line 1053, in _validate_conn
conn.connect()
File "/usr/local/lib/python3.11/site-packages/urllib3/connection.py", line 363, in connect
self.sock = conn = self._new_conn()
^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/urllib3/connection.py", line 186, in _new_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x7f66b5396710>: Failed to establish a new connection: [Errno 99] Cannot assign requested address
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.11/site-packages/prefect/workers/base.py", line 834, in _submit_run_and_capture_errors
result = await self.run(
^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/prefect_kubernetes/worker.py", line 506, in run
job = await run_sync_in_worker_thread(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/prefect/utilities/asyncutils.py", line 91, in run_sync_in_worker_thread
return await anyio.to_thread.run_sync(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/anyio/to_thread.py", line 33, in run_sync
return await get_asynclib().run_sync_in_worker_thread(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
return await future
^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/anyio/_backends/_asyncio.py", line 807, in run
result = context.run(func, *args)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/prefect_kubernetes/worker.py", line 628, in _create_job
job = batch_client.create_namespaced_job(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/kubernetes/client/api/batch_v1_api.py", line 210, in create_namespaced_job
return self.create_namespaced_job_with_http_info(namespace, body, **kwargs) # noqa: E501
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/kubernetes/client/api/batch_v1_api.py", line 309, in create_namespaced_job_with_http_info
return self.api_client.call_api(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/kubernetes/client/api_client.py", line 348, in call_api
return self.__call_api(resource_path, method,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/kubernetes/client/api_client.py", line 180, in __call_api
response_data = self.request(
^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/kubernetes/client/api_client.py", line 391, in request
return self.rest_client.POST(url,
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/kubernetes/client/rest.py", line 276, in POST
return self.request("POST", url,
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/kubernetes/client/rest.py", line 169, in request
r = self.pool_manager.request(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/urllib3/request.py", line 78, in request
return self.request_encode_body(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/urllib3/request.py", line 170, in request_encode_body
return self.urlopen(method, url, **extra_kw)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/urllib3/poolmanager.py", line 376, in urlopen
response = conn.urlopen(method, u.request_uri, **kw)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/urllib3/connectionpool.py", line 826, in urlopen
return self.urlopen(
^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/urllib3/connectionpool.py", line 826, in urlopen
return self.urlopen(
^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/urllib3/connectionpool.py", line 826, in urlopen
return self.urlopen(
^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/urllib3/connectionpool.py", line 798, in urlopen
retries = retries.increment(
^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/site-packages/urllib3/util/retry.py", line 592, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='172.30.0.1', port=443): Max retries exceeded with url: /apis/batch/v1/namespaces/prefect-2x/jobs (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f66b5396710>: Failed to establish a new connection: [Errno 99] Cannot assign requested address'))
Once this issue arises, we have to restart the Prefect Worker (and any flows that are stuck as a fallout from the issue) in order for the system to function again.
Has any one experienced this issue or have an idea to troubleshoot/fix this?
We’re currently using the following image for the worker prefect:2.11.2-python3.11-kubernetes