Kann mir jemand erklären, was diese Funktion macht? Soweit ich weiß, überprüft sie, ob eine Zeile in der CSV-Datei dupliziert ist oder nicht. Ich möchte aber nur überprüfen, ob eine bestimmte Spalte doppelte Werte enthält. Wie mache ich das? (Thema: DataFrames in Python verstehen)
@Validator
def hasDuplicates(fileInDf, fileType=File_Name_All, kwargs=def_kwargs):
    """Flag rows of a DataFrame that fully duplicate an earlier row.

    A row is a duplicate when all of its column values match a previous
    row; the first occurrence is kept (pandas' default ``keep='first'``).

    Args:
        fileInDf: The pandas DataFrame to check.
        fileType: Not read by this function's body; presumably consumed by
            the ``@Validator`` decorator -- TODO confirm.
        kwargs: Not read by this function's body; presumably consumed by
            the ``@Validator`` decorator -- TODO confirm.

    Returns:
        A ``ValidatorResponse``. For ``None``, non-DataFrame, or empty
        input the decision is ``Rule_Decision.INVALID_INPUT`` with a
        message. Otherwise the decision is ``Rule_Decision.FAILED`` when at
        least one duplicate row exists and ``Rule_Decision.SUCCESS`` when
        none do; ``rule_return_fixedDf`` is the frame without the duplicate
        rows and ``rule_return_val`` is the list of index labels of the
        duplicate rows.
    """
    import pandas

    if fileInDf is None:
        return ValidatorResponse(
            rule_decision=Rule_Decision.INVALID_INPUT,
            rule_return_message='Input File is not a valid file for rule : hasDuplicates',
        )
    # isinstance (rather than an exact type comparison) so that DataFrame
    # subclasses are accepted as valid input.
    if not isinstance(fileInDf, pandas.DataFrame):
        return ValidatorResponse(
            rule_decision=Rule_Decision.INVALID_INPUT,
            rule_return_message='Type %s is not a valid DataFrame Type for rule : hasDuplicates' % type(fileInDf),
        )
    if fileInDf.empty:
        return ValidatorResponse(
            rule_decision=Rule_Decision.INVALID_INPUT,
            rule_return_message='Input File is not a valid file for rule : hasDuplicates',
        )

    # Compute the duplicate mask once and derive everything else from it.
    is_duplicate = fileInDf.duplicated()
    duplicate_indexes = is_duplicate[is_duplicate].index.tolist()
    # Same rows as fileInDf.drop_duplicates(), without a second scan.
    fixedDf = fileInDf[~is_duplicate]

    decision = Rule_Decision.FAILED if is_duplicate.any() else Rule_Decision.SUCCESS
    return ValidatorResponse(
        rule_decision=decision,
        rule_return_fixedDf=fixedDf,
        rule_return_val=duplicate_indexes,
    )
UPDATE:
@Validator
def hasDuplicatesSingleColumn(val, fileInDf, fileType=File_Name_All, kwargs=def_kwargs):
    """Flag rows whose value in one column repeats an earlier row's value.

    Only the column labelled ``val`` is compared; the first occurrence of
    each value is kept (pandas' default ``keep='first'``).

    NOTE(review): the previous body ignored ``val`` and always looked up a
    column literally named ``'column'``. This version treats ``val`` as the
    column label -- confirm that this is what callers pass.

    Args:
        val: Label of the column to check for repeated values.
        fileInDf: The pandas DataFrame to check.
        fileType: Not read by this function's body; presumably consumed by
            the ``@Validator`` decorator -- TODO confirm.
        kwargs: Not read by this function's body; presumably consumed by
            the ``@Validator`` decorator -- TODO confirm.

    Returns:
        A ``ValidatorResponse``. For ``None``, non-DataFrame, or empty
        input, or when ``val`` is not a column of ``fileInDf``, the
        decision is ``Rule_Decision.INVALID_INPUT`` with a message.
        Otherwise the decision is ``Rule_Decision.FAILED`` when the column
        contains at least one repeated value and ``Rule_Decision.SUCCESS``
        when it does not; ``rule_return_fixedDf`` is the input frame (all
        columns) without the rows holding repeated values, and
        ``rule_return_val`` is the list of index labels of those rows.
    """
    import pandas

    if fileInDf is None:
        return ValidatorResponse(
            rule_decision=Rule_Decision.INVALID_INPUT,
            rule_return_message='Input File is not a valid file for rule : hasDuplicatesSingleColumn',
        )
    # isinstance (rather than an exact type comparison) so that DataFrame
    # subclasses are accepted as valid input.
    if not isinstance(fileInDf, pandas.DataFrame):
        return ValidatorResponse(
            rule_decision=Rule_Decision.INVALID_INPUT,
            rule_return_message='Type %s is not a valid DataFrame Type for rule : hasDuplicatesSingleColumn' % type(fileInDf),
        )
    if fileInDf.empty:
        return ValidatorResponse(
            rule_decision=Rule_Decision.INVALID_INPUT,
            rule_return_message='Input File is not a valid file for rule : hasDuplicatesSingleColumn',
        )
    # Report a missing column as invalid input instead of letting a bare
    # KeyError escape from the lookup below.
    if val not in fileInDf.columns:
        return ValidatorResponse(
            rule_decision=Rule_Decision.INVALID_INPUT,
            rule_return_message='Column %s not found in input file for rule : hasDuplicatesSingleColumn' % (val,),
        )

    # Compare only the requested column, but keep the mask aligned with the
    # full frame so the index labels and the fixed frame refer to whole rows.
    is_duplicate = fileInDf.duplicated(subset=[val])
    duplicate_indexes = is_duplicate[is_duplicate].index.tolist()
    # Same rows as fileInDf.drop_duplicates(subset=[val]); all columns kept.
    fixedDf = fileInDf[~is_duplicate]

    decision = Rule_Decision.FAILED if is_duplicate.any() else Rule_Decision.SUCCESS
    return ValidatorResponse(
        rule_decision=decision,
        rule_return_fixedDf=fixedDf,
        rule_return_val=duplicate_indexes,
    )
Aber wie bekomme ich die Indizes der doppelten Zeilen? Ist der Ansatz in der obigen Funktion der richtige Weg?
Können Sie etwas Kontext liefern? Woher kommen Dinge wie "Validator" und "ValidatorResponse"? Verwenden Sie zusätzlich zu pandas ein bestimmtes Paket?