From b3001595abb75db85a3e83d7105265f9c10bd644 Mon Sep 17 00:00:00 2001 From: "mergify[bot]" <37929162+mergify[bot]@users.noreply.github.com> Date: Wed, 29 Apr 2026 12:02:04 +0000 Subject: [PATCH] fix: use RecoverableErrors isinstance check for repost timeout status (backport #54543) (#54649) fix: use RecoverableErrors isinstance check for repost timeout status When a Repost Item Valuation job is killed by an RQ worker timeout (JobTimeoutException raised via SIGALRM), the existing status detection relied solely on traceback string matching for 'timeout' or 'Deadlock'. This is unreliable because SIGALRM can interrupt a C-extension call (e.g. inside pypika's copy.copy()) before Python records the exception in the traceback. In that case the traceback shows only the interrupted frame -- not JobTimeoutException -- so the job is permanently marked 'Failed' instead of 'In Progress', preventing the scheduler from automatically retrying it. RecoverableErrors = (JobTimeoutException, QueryDeadlockError, QueryTimeoutError) is already defined at the top of this file and is already used further down in the same except block to suppress email notifications. Extend its use to also guard the status decision. The traceback string fallback is kept as a secondary check for forward compatibility with other timeout signals. Fixes: jobs permanently stuck as 'Failed' after RQ worker timeout, requiring manual re-queue to resume reposting. (cherry picked from commit a49e2de8667e3a4fded04338be9c5ff4cbb6bba0) Co-authored-by: Assem Bahnasy --- .../repost_item_valuation/repost_item_valuation.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/erpnext/stock/doctype/repost_item_valuation/repost_item_valuation.py b/erpnext/stock/doctype/repost_item_valuation/repost_item_valuation.py index c63c2d34916..f4a0c16101c 100644 --- a/erpnext/stock/doctype/repost_item_valuation/repost_item_valuation.py +++ b/erpnext/stock/doctype/repost_item_valuation/repost_item_valuation.py @@ -411,8 +411,15 @@ def repost(doc): message = message.get("message") status = "Failed" - # If failed because of timeout, set status to In Progress - if traceback and ("timeout" in traceback.lower() or "Deadlock found" in traceback): + # If failed because of a recoverable error (timeout, deadlock), set status to In Progress + # so the scheduler automatically retries instead of leaving it permanently failed. + # NOTE: isinstance check comes first because the traceback string matching is unreliable + # when SIGALRM kills the process mid-C-extension (JobTimeoutException may not appear + # in the traceback if the exception handler itself was interrupted). + traceback_lower = traceback.lower() if traceback else "" + if isinstance(e, RecoverableErrors) or ( + traceback_lower and ("timeout" in traceback_lower or "deadlock found" in traceback_lower) + ): status = "In Progress" if traceback: