class IntelligentDataAnalyzer(BaseTool):
    """Tool that profiles a tabular dataset and returns a summary plus findings.

    ``_run`` builds a ``pandas.DataFrame`` from a list of row dicts and,
    depending on ``analysis_type``, runs correlation analysis, K-means
    clustering and IQR / z-score outlier detection. The result is a
    ``(summary, artifact)`` pair, in line with the ``"content_and_artifact"``
    response format declared on the class.
    """

    name: str = "intelligent_data_analyzer"
    # NOTE(review): the description advertises visualizations, but no method
    # in this class produces any - confirm whether that wording is intended.
    description: str = "Advanced data analysis tool that performs statistical analysis, machine learning clustering, outlier detection, correlation analysis, and generates visualizations with actionable insights."
    args_schema: type[BaseModel] = DataAnalysisInput
    response_format: str = "content_and_artifact"

    def _run(
        self,
        data: List[Dict],
        analysis_type: str = "comprehensive",
        target_column: Optional[str] = None,
        max_clusters: int = 5,
    ) -> Tuple[str, Dict]:
        """Run the requested analyses and return ``(summary, artifact)``.

        Args:
            data: Rows of the dataset, one dict per row.
            analysis_type: ``"comprehensive"`` runs every analysis;
                ``"correlation"``, ``"clustering"`` or ``"outlier"`` runs only
                that one. Any other value yields dataset info only.
            target_column: Optional column to analyse in more depth. If the
                column does not exist, the ``target_analysis`` entry carries
                an ``"error"`` message instead of being silently omitted.
            max_clusters: Upper bound on the number of K-means clusters tried.

        Returns:
            A human-readable summary string and an artifact dict holding the
            raw insights, recommendations and basic dataset metadata.

        Raises:
            ToolException: If the dataset is empty or any analysis step fails.
                Every failure (including the empty-dataset one) is re-raised
                with an ``"Analysis failed: ..."`` message.
        """
        try:
            df = pd.DataFrame(data)
            if df.empty:
                raise ToolException("Dataset is empty")

            insights = {"dataset_info": self._get_dataset_info(df)}
            if analysis_type in ("comprehensive", "correlation"):
                insights["correlation_analysis"] = self._correlation_analysis(df)
            if analysis_type in ("comprehensive", "clustering"):
                insights["clustering_analysis"] = self._clustering_analysis(df, max_clusters)
            if analysis_type in ("comprehensive", "outlier"):
                insights["outlier_detection"] = self._outlier_detection(df)
            if target_column:
                # _target_analysis reports an unknown column itself, so a typo
                # in target_column is surfaced rather than ignored.
                insights["target_analysis"] = self._target_analysis(df, target_column)

            recommendations = self._generate_recommendations(df, insights)
            summary = self._create_analysis_summary(insights, recommendations)
            artifact = {
                "insights": insights,
                "recommendations": recommendations,
                "data_shape": df.shape,
                "analysis_type": analysis_type,
                "numeric_columns": df.select_dtypes(include=[np.number]).columns.tolist(),
                "categorical_columns": df.select_dtypes(include=["object"]).columns.tolist(),
            }
            return summary, artifact
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise ToolException(f"Analysis failed: {str(e)}") from e

    def _get_dataset_info(self, df: pd.DataFrame) -> Dict:
        """Return shape, column names, dtypes, missing counts and memory usage."""
        return {
            "shape": df.shape,
            "columns": df.columns.tolist(),
            "dtypes": df.dtypes.astype(str).to_dict(),
            # Native ints keep the artifact free of NumPy scalar types.
            "missing_values": {col: int(count) for col, count in df.isnull().sum().items()},
            "memory_usage": int(df.memory_usage(deep=True).sum()),
        }

    def _correlation_analysis(self, df: pd.DataFrame) -> Dict:
        """Compute pairwise correlations between the numeric columns.

        Returns:
            A dict with the rounded correlation matrix, the column pairs whose
            absolute correlation exceeds 0.7, and the mean correlation over
            all distinct pairs. ``avg_correlation`` is ``None`` when no pair
            has a defined correlation (for example a single numeric column).
            If there are no numeric columns, a dict with only a ``"message"``.
        """
        numeric_df = df.select_dtypes(include=[np.number])
        if numeric_df.empty:
            return {"message": "No numeric columns for correlation analysis"}

        corr_matrix = numeric_df.corr()
        columns = corr_matrix.columns
        values = corr_matrix.to_numpy()
        # Strict upper triangle: each unordered pair once, no self-pairs.
        upper_rows, upper_cols = np.triu_indices_from(values, k=1)

        strong_corr = []
        for i, j in zip(upper_rows, upper_cols):
            corr_val = float(values[i, j])
            # NaN (e.g. a constant column) compares False and is skipped.
            if abs(corr_val) > 0.7:
                strong_corr.append({
                    "var1": columns[i],
                    "var2": columns[j],
                    "correlation": round(corr_val, 3),
                })

        pairwise = values[upper_rows, upper_cols]
        defined = pairwise[~np.isnan(pairwise)]
        avg_correlation = round(float(defined.mean()), 3) if defined.size else None

        return {
            "correlation_matrix": corr_matrix.round(3).to_dict(),
            "strong_correlations": strong_corr,
            "avg_correlation": avg_correlation,
        }

    def _clustering_analysis(self, df: pd.DataFrame, max_clusters: int) -> Dict:
        """Cluster the complete numeric rows with K-means.

        Rows containing missing numeric values are dropped and the remaining
        data is standardised. K-means is fitted for each candidate ``k`` from
        1 up to ``min(max_clusters, n_rows // 2)`` (never fewer than one
        candidate), and ``_find_elbow_point`` picks the final ``k``.

        Returns:
            A dict with the chosen cluster count, per-cluster size, share and
            column means, the silhouette score (0.0 when only one cluster
            label is present) and the inertia per candidate ``k``. If fewer
            than two rows or two numeric columns remain, a dict with only a
            ``"message"``.
        """
        numeric_df = df.select_dtypes(include=[np.number]).dropna()
        if numeric_df.shape[0] < 2 or numeric_df.shape[1] < 2:
            return {"message": "Insufficient numeric data for clustering"}

        scaled_data = StandardScaler().fit_transform(numeric_df)

        # Floor at 1 so a non-positive max_clusters cannot yield an empty
        # candidate range (which previously crashed the elbow search).
        largest_k = max(1, min(max_clusters, len(numeric_df) // 2))
        k_range = range(1, largest_k + 1)

        inertias = []
        for k in k_range:
            candidate = KMeans(n_clusters=k, random_state=42, n_init=10)
            candidate.fit(scaled_data)
            inertias.append(float(candidate.inertia_))

        optimal_k = self._find_elbow_point(inertias, k_range)
        kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(scaled_data)

        cluster_stats = {}
        for i in range(optimal_k):
            cluster_data = numeric_df[cluster_labels == i]
            cluster_stats[f"cluster_{i}"] = {
                "size": len(cluster_data),
                "percentage": round(len(cluster_data) / len(numeric_df) * 100, 1),
                "means": cluster_data.mean().round(3).to_dict(),
            }

        # The silhouette score is only computed when there are >= 2 labels.
        if len(set(cluster_labels)) > 1:
            silhouette = round(float(silhouette_score(scaled_data, cluster_labels)), 3)
        else:
            silhouette = 0.0

        return {
            "optimal_clusters": optimal_k,
            "cluster_stats": cluster_stats,
            "silhouette_score": silhouette,
            "inertias": inertias,
        }

    def _outlier_detection(self, df: pd.DataFrame) -> Dict:
        """Count outliers per numeric column using the IQR and z-score rules.

        A value is an IQR outlier when it lies more than 1.5 * IQR outside the
        quartiles, and a z-score outlier when its absolute z-score exceeds 3.
        ``outlier_percentage`` is based on the IQR count. Columns with no
        non-missing values report zeros.

        Returns:
            A dict keyed by column, or a dict with only a ``"message"`` when
            there are no numeric columns.
        """
        numeric_df = df.select_dtypes(include=[np.number])
        if numeric_df.empty:
            return {"message": "No numeric columns for outlier detection"}

        outliers = {}
        for col in numeric_df.columns:
            data = numeric_df[col].dropna()
            if data.empty:
                # Nothing to measure; also avoids dividing by len(data) == 0.
                outliers[col] = {
                    "iqr_outliers": 0,
                    "z_score_outliers": 0,
                    "outlier_percentage": 0.0,
                }
                continue

            q1, q3 = data.quantile(0.25), data.quantile(0.75)
            iqr = q3 - q1
            outside_fences = (data < q1 - 1.5 * iqr) | (data > q3 + 1.5 * iqr)
            iqr_count = int(outside_fences.sum())

            std = data.std()
            if pd.notna(std) and std > 0:
                z_scores = np.abs((data - data.mean()) / std)
                z_count = int((z_scores > 3).sum())
            else:
                # Constant or single-value column: z-scores are undefined.
                z_count = 0

            outliers[col] = {
                "iqr_outliers": iqr_count,
                "z_score_outliers": z_count,
                "outlier_percentage": round(iqr_count / len(data) * 100, 2),
            }
        return outliers

    def _target_analysis(self, df: pd.DataFrame, target_col: str) -> Dict:
        """Describe the distribution of a single column.

        Numeric columns get mean, median, standard deviation, skewness and
        kurtosis, and are labelled ``"normal"`` when the absolute skewness is
        below 0.5, otherwise ``"skewed"``. Other columns get the number of
        unique values, the five most common values and the entropy (base 2)
        of the value frequencies.

        Returns:
            The statistics dict, or ``{"error": ...}`` if the column is absent.
        """
        if target_col not in df.columns:
            return {"error": f"Column {target_col} not found"}

        target_data = df[target_col].dropna()
        if pd.api.types.is_numeric_dtype(target_data):
            skewness = target_data.skew()
            return {
                "type": "numeric",
                "stats": {
                    "mean": round(target_data.mean(), 3),
                    "median": round(target_data.median(), 3),
                    "std": round(target_data.std(), 3),
                    "skewness": round(skewness, 3),
                    "kurtosis": round(target_data.kurtosis(), 3),
                },
                "distribution": "normal" if abs(skewness) < 0.5 else "skewed",
            }

        value_counts = target_data.value_counts()
        probabilities = value_counts / len(target_data)
        # The small epsilon keeps log2 finite for zero-frequency entries.
        entropy = -float((probabilities * np.log2(probabilities + 1e-10)).sum())
        return {
            "type": "categorical",
            "unique_values": len(value_counts),
            "most_common": value_counts.head(5).to_dict(),
            # Clamp at 0 so rounding noise cannot produce a negative zero.
            "entropy": round(max(0.0, entropy), 3),
        }

    def _generate_recommendations(self, df: pd.DataFrame, insights: Dict) -> List[str]:
        """Turn the collected insights into short textual recommendations.

        Recommendations are added when more than 10% of all cells are missing,
        when strong correlations were found, when clustering produced a
        cluster count, and when any column has more than 5% IQR outliers. A
        single default message is returned when none of these apply.
        """
        recommendations = []

        total_cells = df.shape[0] * df.shape[1]
        missing_cells = sum(insights["dataset_info"]["missing_values"].values())
        missing_pct = missing_cells / total_cells * 100 if total_cells else 0.0
        if missing_pct > 10:
            recommendations.append(f"Consider data imputation - {missing_pct:.1f}% missing values detected")

        if "correlation_analysis" in insights and insights["correlation_analysis"].get("strong_correlations"):
            recommendations.append("Strong correlations detected - consider feature selection or dimensionality reduction")

        if "clustering_analysis" in insights:
            cluster_info = insights["clustering_analysis"]
            if isinstance(cluster_info, dict) and "optimal_clusters" in cluster_info:
                recommendations.append(f"Data segments into {cluster_info['optimal_clusters']} distinct groups - useful for targeted strategies")

        if "outlier_detection" in insights:
            # str() because column labels are not guaranteed to be strings.
            high_outlier_cols = [
                str(col)
                for col, info in insights["outlier_detection"].items()
                if isinstance(info, dict) and info.get("outlier_percentage", 0) > 5
            ]
            if high_outlier_cols:
                recommendations.append(f"High outlier percentage in: {', '.join(high_outlier_cols)} - investigate data quality")

        return recommendations if recommendations else ["Data appears well-structured with no immediate concerns"]

    def _create_analysis_summary(self, insights: Dict, recommendations: List[str]) -> str:
        """Build the human-readable summary returned as the tool's content.

        The summary reports the dataset dimensions, counts columns whose dtype
        name contains ``int``/``float`` (numeric) or ``object`` (categorical),
        and lists at most the first three recommendations.
        """
        dataset_info = insights["dataset_info"]
        n_rows, n_cols = dataset_info["shape"][0], dataset_info["shape"][1]
        dtype_names = dataset_info["dtypes"].values()
        numeric_count = sum(1 for t in dtype_names if "int" in t or "float" in t)
        categorical_count = sum(1 for t in dtype_names if "object" in t)
        top_recommendations = "\n".join("• " + rec for rec in recommendations[:3])

        # Assembled line by line so the emitted text carries no source
        # indentation.
        lines = [
            "📊 INTELLIGENT DATA ANALYSIS COMPLETE",
            f"Dataset Overview: {n_rows} rows × {n_cols} columns",
            f"Numeric Features: {numeric_count}",
            f"Categorical Features: {categorical_count}",
            "Key Insights Generated:",
            "• Statistical correlations and relationships identified",
            "• Clustering patterns discovered for segmentation",
            "• Outlier detection completed for data quality assessment",
            "• Feature importance and distribution analysis performed",
            "Top Recommendations:",
            top_recommendations,
            "Analysis includes ML-powered clustering, statistical correlations, and actionable business insights.",
        ]
        return "\n".join(lines)

    def _find_elbow_point(self, inertias: List[float], k_range: range) -> int:
        """Pick a cluster count from the inertia curve.

        With fewer than three inertia values the first candidate ``k`` is
        returned. Otherwise the result is the ``k`` reached by the single
        largest drop in inertia between consecutive candidates.

        NOTE(review): the largest drop is very often the first one, so this
        heuristic tends to return the second candidate; a curvature-based
        elbow may be worth considering.
        """
        if len(k_range) == 0:
            # No candidates at all: fall back to a single cluster.
            return 1
        if len(inertias) < 3:
            return k_range[0]
        drops = [inertias[i - 1] - inertias[i] for i in range(1, len(inertias))]
        return k_range[drops.index(max(drops)) + 1]