def _handle_missing_values(self, df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows missing required fields and fill gaps in optional ones.

    Each column listed in ``self.schema`` that is also present in ``df`` is
    handled according to its rules:

    * ``required`` truthy: rows where the column is null are dropped and a
      message with the number of dropped rows is appended to
      ``self.errors``.
    * otherwise: nulls are replaced with the column median for numeric
      (non-boolean) columns, or with the string ``'Unknown'`` for any other
      dtype.

    Args:
        df: Frame to clean. It is copied first, so the caller's frame is
            left untouched.

    Returns:
        A new frame with the drops and fills applied.
    """
    # Work on a copy: the fills below assign whole columns, which would
    # otherwise write into the frame the caller handed in.
    df = df.copy()

    for column, rules in self.schema.items():
        if column not in df.columns:
            continue

        if rules.get('required', False):
            missing_count = df[column].isnull().sum()
            if missing_count > 0:
                self.errors.append(f"Removed {missing_count} rows with missing {column}")
                df = df.dropna(subset=[column])
            continue

        values = df[column]
        # Any numeric dtype gets the median (not only int64/float64);
        # booleans are excluded because a median is not meaningful there.
        is_numeric = (
            pd.api.types.is_numeric_dtype(values)
            and not pd.api.types.is_bool_dtype(values)
        )
        fill_value = values.median() if is_numeric else 'Unknown'

        # Assign the filled column back instead of calling
        # ``df[column].fillna(..., inplace=True)``: the chained in-place
        # form operates on a temporary and is not guaranteed to reach ``df``.
        df[column] = values.fillna(fill_value)

    return df
数据类型验证
以下代码将列转换为指定类型并删除转换失败的行。
def _validate_data_types(self, df: pd.DataFrame) -> pd.DataFrame:
    """Coerce schema columns to their declared types and drop unusable rows.

    For each column in ``self.schema`` that exists in ``df``, the rule's
    ``'type'`` entry selects the conversion:

    * ``'datetime'``: ``pd.to_datetime(..., errors='coerce')``
    * ``int``: ``pd.to_numeric(..., errors='coerce')`` followed by a cast to
      the nullable ``'Int64'`` dtype; values that are not whole numbers are
      treated as conversion failures
    * ``float``: ``pd.to_numeric(..., errors='coerce')``
    * anything else: no conversion

    After the conversion step every row whose value in that column is null
    is dropped (this includes nulls that were already present before the
    conversion) and a message with the row count is appended to
    ``self.errors``.

    If the conversion or the drop raises, the error is logged through
    ``self.logger`` and processing continues with the next column.

    Args:
        df: Frame to validate. It is copied first, so the caller's frame is
            left untouched.

    Returns:
        A new frame with converted columns and invalid rows removed.

    Raises:
        KeyError: If a schema entry for a present column has no ``'type'``.
    """
    # Converted columns are assigned back below; copy so the caller's frame
    # is not rewritten as a side effect.
    df = df.copy()

    for column, rules in self.schema.items():
        if column not in df.columns:
            continue

        expected_type = rules['type']

        try:
            if expected_type == 'datetime':
                df[column] = pd.to_datetime(df[column], errors='coerce')
            elif expected_type == int:
                numeric = pd.to_numeric(df[column], errors='coerce')
                # Casting a fractional or infinite value to 'Int64' raises,
                # which would abort the conversion of the whole column.
                # Null those values out first so they are removed below
                # like any other failed conversion.
                is_integral = numeric.isna() | (numeric % 1 == 0)
                df[column] = numeric.where(is_integral).astype('Int64')
            elif expected_type == float:
                df[column] = pd.to_numeric(df[column], errors='coerce')

            # Coerced failures show up as nulls; drop those rows.
            invalid_count = df[column].isnull().sum()
            if invalid_count > 0:
                self.errors.append(f"Removed {invalid_count} rows with invalid {column}")
                df = df.dropna(subset=[column])
        except Exception as e:
            # Deliberately broad: one bad column is logged and skipped
            # rather than aborting validation of the remaining columns.
            self.logger.error(f"Type conversion error for {column}: {e}")

    return df
添加带有错误跟踪的验证
我们的约束验证系统确保数据处于限定范围之内，并且格式符合要求：
def _apply_constraints(self, df: pd.DataFrame) -> pd.DataFrame:
    """Filter out rows that violate per-column schema constraints.

    For each column in ``self.schema`` that exists in ``df``, the following
    optional rules are applied in order:

    * ``min_value``: keep rows where the value is ``>= min_value``
    * ``max_value``: keep rows where the value is ``<= max_value``
    * ``pattern``: for text columns only, keep rows whose string form
      matches the regular expression from the start of the string
      (``Series.str.match`` semantics)

    When a column loses rows, a message with the number of removed rows is
    appended to ``self.errors``.

    Args:
        df: Frame to filter. Rows are selected, not modified.

    Returns:
        The filtered frame.
    """
    # Function-scope import as in the original, but done once rather than
    # on every loop iteration.
    import re

    for column, rules in self.schema.items():
        if column not in df.columns:
            continue

        initial_count = len(df)

        # Range validation
        if 'min_value' in rules:
            df = df[df[column] >= rules['min_value']]
        if 'max_value' in rules:
            df = df[df[column] <= rules['max_value']]

        # Pattern validation for text columns. Checking only for the
        # 'object' dtype would silently skip columns stored with a pandas
        # string dtype, so both are accepted.
        if 'pattern' in rules:
            values = df[column]
            is_text = (
                pd.api.types.is_object_dtype(values)
                or pd.api.types.is_string_dtype(values)
            )
            if is_text:
                pattern = re.compile(rules['pattern'])
                # Match on an object-dtype view so the compiled ``re``
                # pattern is evaluated the same way whatever string
                # storage the column uses.
                text = values.astype(str).astype(object)
                df = df[text.str.match(pattern, na=False)]

        removed_count = initial_count - len(df)
        if removed_count > 0:
            self.errors.append(f"Removed {removed_count} rows failing {column} constraints")

    return df
约束验证与跨字段验证
在需要考虑多个字段之间的关系时，通常需要更高级的验证：
def _cross_field_validation(self, df: pd.DataFrame) -> pd.DataFrame:
    """Remove rows whose fields are inconsistent with one another.

    Two checks are applied, each only when the columns it needs exist:

    1. ``signup_date`` later than the current time: the row is removed.
    2. ``age`` below 13 combined with a ``signup_date`` before 2010-01-01:
       the row is removed.

    For each check that removes rows, a message with the row count is
    appended to ``self.errors``.

    Args:
        df: Frame to validate. ``signup_date`` is compared against
            ``datetime`` objects, so it is presumably already a datetime
            column at this point (verify against the caller).

    Returns:
        The frame without the offending rows.
    """
    # Example: Signup date should not be in the future
    if 'signup_date' in df.columns:
        future_signups = df['signup_date'] > datetime.now()
        removed = future_signups.sum()
        df = df[~future_signups]
        if removed > 0:
            self.errors.append(f"Removed {removed} rows with future signup dates")

    # Example: Age consistency with signup date
    if 'age' in df.columns and 'signup_date' in df.columns:
        # NOTE(review): the comparison operator was missing in the source
        # (``df['age'] 13``). ``<`` is assumed from the "suspicious age"
        # intent of this check -- confirm.
        too_young = df['age'] < 13
        early_signup = df['signup_date'] < datetime(2010, 1, 1)
        suspicious_age = too_young & early_signup
        removed = suspicious_age.sum()
        df = df[~suspicious_age]
        if removed > 0:
            self.errors.append(f"Removed {removed} rows with suspicious age/date combinations")

    return df