drm/i915: clear up wedged transitions
We have two important transitions of the wedged state in the current code: - 0 -> 1: This means a hang has been detected, and signals to everyone that they please get of any locks, so that the reset work item can do its job. - 1 -> 0: The reset handler has completed. Now the last transition mixes up two states: "Reset completed and successful" and "Reset failed". To distinguish these two we do some tricks with the reset completion, but I simply could not convince myself that this doesn't race under odd circumstances. Hence split this up, and add a new terminal state indicating that the hw is gone for good. Also add explicit #defines for both states, update comments. v2: Split out the reset handling bugfix for the throttle ioctl. v3: s/tmp/wedged/ sugested by Chris Wilson. Also fixup up a rebase error which prevented this patch from actually compiling. v4: To unify the wedged state with the reset counter, keep the reset-in-progress state just as a flag. The terminally-wedged state is now denoted with a big number. v5: Add a comment to the reset_counter special values explaining that WEDGED & RESET_IN_PROGRESS needs to be true for the code to be correct. v6: Fixup logic errors introduced with the wedged+reset_counter unification. Since WEDGED implies reset-in-progress (in a way we're terminally stuck in the dead-but-reset-not-completed state), we need ensure that we check for this everywhere. The specific bug was in wait_for_error, which would simply have timed out. v7: Extract an inline i915_reset_in_progress helper to make the code more readable. Also annote the reset-in-progress case with an unlikely, to help the compiler optimize the fastpath. Do the same for the terminally wedged case with i915_terminally_wedged. Reviewed-by: Damien Lespiau <damien.lespiau@intel.com> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
This commit is contained in:
@@ -771,11 +771,37 @@ struct i915_gpu_error {
|
||||
/* Protected by the above dev->gpu_error.lock. */
|
||||
struct drm_i915_error_state *first_error;
|
||||
struct work_struct work;
|
||||
struct completion completion;
|
||||
|
||||
unsigned long last_reset;
|
||||
|
||||
atomic_t wedged;
|
||||
/**
|
||||
* State variable controlling the reset flow
|
||||
*
|
||||
* Upper bits are for the reset counter.
|
||||
*
|
||||
* Lowest bit controls the reset state machine: Set means a reset is in
|
||||
* progress. This state will (presuming we don't have any bugs) decay
|
||||
* into either unset (successful reset) or the special WEDGED value (hw
|
||||
* terminally sour). All waiters on the reset_queue will be woken when
|
||||
* that happens.
|
||||
*/
|
||||
atomic_t reset_counter;
|
||||
|
||||
/**
|
||||
* Special values/flags for reset_counter
|
||||
*
|
||||
* Note that the code relies on
|
||||
* I915_WEDGED & I915_RESET_IN_PROGRESS_FLAG
|
||||
* being true.
|
||||
*/
|
||||
#define I915_RESET_IN_PROGRESS_FLAG 1
|
||||
#define I915_WEDGED 0xffffffff
|
||||
|
||||
/**
|
||||
* Waitqueue to signal when the reset has completed. Used by clients
|
||||
* that wait for dev_priv->mm.wedged to settle.
|
||||
*/
|
||||
wait_queue_head_t reset_queue;
|
||||
|
||||
/* For gpu hang simulation. */
|
||||
unsigned int stop_rings;
|
||||
@@ -1543,6 +1569,16 @@ void i915_gem_retire_requests(struct drm_device *dev);
|
||||
void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
|
||||
int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
|
||||
bool interruptible);
|
||||
static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
|
||||
{
|
||||
return unlikely(atomic_read(&error->reset_counter)
|
||||
& I915_RESET_IN_PROGRESS_FLAG);
|
||||
}
|
||||
|
||||
static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
|
||||
{
|
||||
return atomic_read(&error->reset_counter) == I915_WEDGED;
|
||||
}
|
||||
|
||||
void i915_gem_reset(struct drm_device *dev);
|
||||
void i915_gem_clflush_object(struct drm_i915_gem_object *obj);
|
||||
|
||||
Reference in New Issue
Block a user