drm/amdgpu: remove unnecessary reading of eeprom header
If the number of bad page records exceeds the threshold, the driver has already updated both the eeprom header and control->tbl_hdr.header before the gpu reset, therefore the GPU recovery thread does not need to read the eeprom header directly. v2: merge amdgpu_ras_check_err_threshold into amdgpu_ras_eeprom_check_err_threshold Signed-off-by: Dennis Li <Dennis.Li@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
@@ -4399,7 +4399,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
|
|||||||
* bad_page_threshold value to fix this once
|
* bad_page_threshold value to fix this once
|
||||||
* probing driver again.
|
* probing driver again.
|
||||||
*/
|
*/
|
||||||
if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
|
if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
|
||||||
/* must succeed. */
|
/* must succeed. */
|
||||||
amdgpu_ras_resume(tmp_adev);
|
amdgpu_ras_resume(tmp_adev);
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -2189,19 +2189,3 @@ bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
|
|||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev)
|
|
||||||
{
|
|
||||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
|
||||||
bool exc_err_limit = false;
|
|
||||||
|
|
||||||
if (con && (amdgpu_bad_page_threshold != 0))
|
|
||||||
amdgpu_ras_eeprom_check_err_threshold(&con->eeprom_control,
|
|
||||||
&exc_err_limit);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* We are only interested in variable exc_err_limit,
|
|
||||||
* as it says if GPU is in bad state or not.
|
|
||||||
*/
|
|
||||||
return exc_err_limit;
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -491,8 +491,6 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev);
|
|||||||
unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
|
unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
|
||||||
bool is_ce);
|
bool is_ce);
|
||||||
|
|
||||||
bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev);
|
|
||||||
|
|
||||||
/* error handling functions */
|
/* error handling functions */
|
||||||
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
|
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
|
||||||
struct eeprom_table_record *bps, int pages);
|
struct eeprom_table_record *bps, int pages);
|
||||||
|
|||||||
@@ -434,47 +434,21 @@ static uint32_t __correct_eeprom_dest_address(uint32_t curr_address)
|
|||||||
return curr_address;
|
return curr_address;
|
||||||
}
|
}
|
||||||
|
|
||||||
int amdgpu_ras_eeprom_check_err_threshold(
|
bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
|
||||||
struct amdgpu_ras_eeprom_control *control,
|
|
||||||
bool *exceed_err_limit)
|
|
||||||
{
|
{
|
||||||
struct amdgpu_device *adev = to_amdgpu_device(control);
|
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||||
unsigned char buff[EEPROM_ADDRESS_SIZE +
|
|
||||||
EEPROM_TABLE_HEADER_SIZE] = { 0 };
|
|
||||||
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
|
|
||||||
struct i2c_msg msg = {
|
|
||||||
.addr = control->i2c_address,
|
|
||||||
.flags = I2C_M_RD,
|
|
||||||
.len = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE,
|
|
||||||
.buf = buff,
|
|
||||||
};
|
|
||||||
int ret;
|
|
||||||
|
|
||||||
*exceed_err_limit = false;
|
|
||||||
|
|
||||||
if (!__is_ras_eeprom_supported(adev))
|
if (!__is_ras_eeprom_supported(adev))
|
||||||
return 0;
|
return false;
|
||||||
|
|
||||||
/* read EEPROM table header */
|
if (con->eeprom_control.tbl_hdr.header == EEPROM_TABLE_HDR_BAD) {
|
||||||
mutex_lock(&control->tbl_mutex);
|
|
||||||
ret = i2c_transfer(&adev->pm.smu_i2c, &msg, 1);
|
|
||||||
if (ret < 1) {
|
|
||||||
dev_err(adev->dev, "Failed to read EEPROM table header.\n");
|
|
||||||
goto err;
|
|
||||||
}
|
|
||||||
|
|
||||||
__decode_table_header_from_buff(hdr, &buff[2]);
|
|
||||||
|
|
||||||
if (hdr->header == EEPROM_TABLE_HDR_BAD) {
|
|
||||||
dev_warn(adev->dev, "This GPU is in BAD status.");
|
dev_warn(adev->dev, "This GPU is in BAD status.");
|
||||||
dev_warn(adev->dev, "Please retire it or setting one bigger "
|
dev_warn(adev->dev, "Please retire it or setting one bigger "
|
||||||
"threshold value when reloading driver.\n");
|
"threshold value when reloading driver.\n");
|
||||||
*exceed_err_limit = true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
err:
|
return false;
|
||||||
mutex_unlock(&control->tbl_mutex);
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
|
int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
|
||||||
|
|||||||
@@ -80,9 +80,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
|
|||||||
bool *exceed_err_limit);
|
bool *exceed_err_limit);
|
||||||
int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);
|
int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);
|
||||||
|
|
||||||
int amdgpu_ras_eeprom_check_err_threshold(
|
bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev);
|
||||||
struct amdgpu_ras_eeprom_control *control,
|
|
||||||
bool *exceed_err_limit);
|
|
||||||
|
|
||||||
int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
|
int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
|
||||||
struct eeprom_table_record *records,
|
struct eeprom_table_record *records,
|
||||||
|
|||||||
Reference in New Issue
Block a user