drm/amdgpu: fix send ras disable cmd when asic not support ras
cause:
It is necessary to send ras disable command to ras-ta during gfx
block ras later init, because the ras capability is disabled as read
from vbios for vega20 gaming, but the ras context is released
during the ras init process; this causes sending the ras disable
command to ras-ta to fail.
how:
Delay releasing the ras context; it will be released after the gfx
block's later init is done.
Changed from V1:
move release_ras_context into ras_resume
Changed from V2:
checking BIT(UMC) before accessing the eeprom table is more reasonable
Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
committed by
Alex Deucher
parent
0e4c0ae59d
commit
970fd19764
@@ -3521,11 +3521,11 @@ fence_driver_init:
|
|||||||
adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
|
adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
|
||||||
adev->virt.ops = NULL;
|
adev->virt.ops = NULL;
|
||||||
r = -EAGAIN;
|
r = -EAGAIN;
|
||||||
goto failed;
|
goto release_ras_con;
|
||||||
}
|
}
|
||||||
dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
|
dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
|
||||||
amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
|
amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
|
||||||
goto failed;
|
goto release_ras_con;
|
||||||
}
|
}
|
||||||
|
|
||||||
dev_info(adev->dev,
|
dev_info(adev->dev,
|
||||||
@@ -3591,7 +3591,7 @@ fence_driver_init:
|
|||||||
if (r) {
|
if (r) {
|
||||||
dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
|
dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
|
||||||
amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
|
amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
|
||||||
goto failed;
|
goto release_ras_con;
|
||||||
}
|
}
|
||||||
/* must succeed. */
|
/* must succeed. */
|
||||||
amdgpu_ras_resume(adev);
|
amdgpu_ras_resume(adev);
|
||||||
@@ -3625,6 +3625,9 @@ fence_driver_init:
|
|||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
release_ras_con:
|
||||||
|
amdgpu_release_ras_context(adev);
|
||||||
|
|
||||||
failed:
|
failed:
|
||||||
amdgpu_vf_error_trans_all(adev);
|
amdgpu_vf_error_trans_all(adev);
|
||||||
if (atpx)
|
if (atpx)
|
||||||
|
|||||||
@@ -463,7 +463,7 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
|
|||||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||||
struct ras_manager *obj;
|
struct ras_manager *obj;
|
||||||
|
|
||||||
if (!con)
|
if (!adev->ras_features || !con)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
|
if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
|
||||||
@@ -490,7 +490,7 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
|
|||||||
struct ras_manager *obj;
|
struct ras_manager *obj;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
if (!con)
|
if (!adev->ras_features || !con)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
if (head) {
|
if (head) {
|
||||||
@@ -590,7 +590,11 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
|
|||||||
con->features |= BIT(head->block);
|
con->features |= BIT(head->block);
|
||||||
} else {
|
} else {
|
||||||
if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
|
if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
|
||||||
con->features &= ~BIT(head->block);
|
/* skip clean gfx ras context feature for VEGA20 Gaming.
|
||||||
|
* will clean later
|
||||||
|
*/
|
||||||
|
if (!(!adev->ras_features && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)))
|
||||||
|
con->features &= ~BIT(head->block);
|
||||||
put_obj(obj);
|
put_obj(obj);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -693,6 +697,10 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
|
|||||||
if (ret)
|
if (ret)
|
||||||
return ret;
|
return ret;
|
||||||
|
|
||||||
|
/* gfx block ras dsiable cmd must send to ras-ta */
|
||||||
|
if (head->block == AMDGPU_RAS_BLOCK__GFX)
|
||||||
|
con->features |= BIT(head->block);
|
||||||
|
|
||||||
ret = amdgpu_ras_feature_enable(adev, head, 0);
|
ret = amdgpu_ras_feature_enable(adev, head, 0);
|
||||||
}
|
}
|
||||||
} else
|
} else
|
||||||
@@ -948,7 +956,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
|
|||||||
struct ras_manager *obj;
|
struct ras_manager *obj;
|
||||||
struct ras_err_data data = {0, 0};
|
struct ras_err_data data = {0, 0};
|
||||||
|
|
||||||
if (!con)
|
if (!adev->ras_features || !con)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
list_for_each_entry(obj, &con->head, node) {
|
list_for_each_entry(obj, &con->head, node) {
|
||||||
@@ -1469,7 +1477,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
|
|||||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||||
struct ras_manager *obj;
|
struct ras_manager *obj;
|
||||||
|
|
||||||
if (!con)
|
if (!adev->ras_features || !con)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
list_for_each_entry(obj, &con->head, node) {
|
list_for_each_entry(obj, &con->head, node) {
|
||||||
@@ -1517,7 +1525,7 @@ static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
|
|||||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||||
struct ras_manager *obj;
|
struct ras_manager *obj;
|
||||||
|
|
||||||
if (!con)
|
if (!adev->ras_features || !con)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
list_for_each_entry(obj, &con->head, node) {
|
list_for_each_entry(obj, &con->head, node) {
|
||||||
@@ -1830,7 +1838,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
|
|||||||
bool exc_err_limit = false;
|
bool exc_err_limit = false;
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
if (con)
|
if (adev->ras_features && con)
|
||||||
data = &con->eh_data;
|
data = &con->eh_data;
|
||||||
else
|
else
|
||||||
return 0;
|
return 0;
|
||||||
@@ -2005,6 +2013,15 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
|
|||||||
amdgpu_ras_check_supported(adev, &con->hw_supported,
|
amdgpu_ras_check_supported(adev, &con->hw_supported,
|
||||||
&con->supported);
|
&con->supported);
|
||||||
if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) {
|
if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) {
|
||||||
|
/* set gfx block ras context feature for VEGA20 Gaming
|
||||||
|
* send ras disable cmd to ras ta during ras late init.
|
||||||
|
*/
|
||||||
|
if (!adev->ras_features && adev->asic_type == CHIP_VEGA20) {
|
||||||
|
con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
r = 0;
|
r = 0;
|
||||||
goto release_con;
|
goto release_con;
|
||||||
}
|
}
|
||||||
@@ -2118,8 +2135,12 @@ void amdgpu_ras_resume(struct amdgpu_device *adev)
|
|||||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||||
struct ras_manager *obj, *tmp;
|
struct ras_manager *obj, *tmp;
|
||||||
|
|
||||||
if (!con)
|
if (!adev->ras_features || !con) {
|
||||||
|
/* clean ras context for VEGA20 Gaming after send ras disable cmd */
|
||||||
|
amdgpu_release_ras_context(adev);
|
||||||
|
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
|
if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
|
||||||
/* Set up all other IPs which are not implemented. There is a
|
/* Set up all other IPs which are not implemented. There is a
|
||||||
@@ -2160,7 +2181,7 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev)
|
|||||||
{
|
{
|
||||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||||
|
|
||||||
if (!con)
|
if (!adev->ras_features || !con)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
amdgpu_ras_disable_all_features(adev, 0);
|
amdgpu_ras_disable_all_features(adev, 0);
|
||||||
@@ -2174,7 +2195,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
|
|||||||
{
|
{
|
||||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||||
|
|
||||||
if (!con)
|
if (!adev->ras_features || !con)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
/* Need disable ras on all IPs here before ip [hw/sw]fini */
|
/* Need disable ras on all IPs here before ip [hw/sw]fini */
|
||||||
@@ -2187,7 +2208,7 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
|
|||||||
{
|
{
|
||||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||||
|
|
||||||
if (!con)
|
if (!adev->ras_features || !con)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
amdgpu_ras_fs_fini(adev);
|
amdgpu_ras_fs_fini(adev);
|
||||||
@@ -2230,3 +2251,17 @@ bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
|
|||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void amdgpu_release_ras_context(struct amdgpu_device *adev)
|
||||||
|
{
|
||||||
|
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||||
|
|
||||||
|
if (!con)
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (!adev->ras_features && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) {
|
||||||
|
con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX);
|
||||||
|
amdgpu_ras_set_context(adev, NULL);
|
||||||
|
kfree(con);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -626,4 +626,6 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
|
|||||||
void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready);
|
void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready);
|
||||||
|
|
||||||
bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev);
|
bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev);
|
||||||
|
|
||||||
|
void amdgpu_release_ras_context(struct amdgpu_device *adev);
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -441,7 +441,14 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
|
|||||||
if (!__is_ras_eeprom_supported(adev))
|
if (!__is_ras_eeprom_supported(adev))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (con && (con->eeprom_control.tbl_hdr.header == EEPROM_TABLE_HDR_BAD)) {
|
/* skip check eeprom table for VEGA20 Gaming */
|
||||||
|
if (!con)
|
||||||
|
return false;
|
||||||
|
else
|
||||||
|
if (!(con->features & BIT(AMDGPU_RAS_BLOCK__UMC)))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (con->eeprom_control.tbl_hdr.header == EEPROM_TABLE_HDR_BAD) {
|
||||||
dev_warn(adev->dev, "This GPU is in BAD status.");
|
dev_warn(adev->dev, "This GPU is in BAD status.");
|
||||||
dev_warn(adev->dev, "Please retire it or setting one bigger "
|
dev_warn(adev->dev, "Please retire it or setting one bigger "
|
||||||
"threshold value when reloading driver.\n");
|
"threshold value when reloading driver.\n");
|
||||||
|
|||||||
Reference in New Issue
Block a user