In [1]:
!pip install folium
!pip install hdbscan
!pip install scikit-learn
Requirement already satisfied: folium in c:\users\anhbv\anaconda3\lib\site-packages (0.19.6)
Collecting hdbscan
Successfully installed hdbscan-0.8.40
Requirement already satisfied: scikit-learn in c:\users\anhbv\anaconda3\lib\site-packages (1.5.1)
In [81]:
import numpy as np
import pandas as pd
import folium
from folium import plugins
import hdbscan
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
In [83]:
file_path = r'C:\Users\anhbv\Downloads\Reported_Crime__STARS_Category_Offenses__20250524.csv'
# Load the dataset
df = pd.read_csv(file_path)
# Preview the data
df.head()
Out[83]:
Incident_No DateReported DateFrom DateTo CLSD BEAT SECTION STARS_CATEGORY DAY_OF_WEEK CPD_Neighborhood Hour_From Hour_To LATITUDE_X LONGITUDE_X Address_X type NIBRSUCRCODE SNA_Neighborhood CC_Neighborhood
0 2024-INC-000149 6/4/2024 14:33 5/3/2024 6:00 5/3/2024 8:00 EARLY_CLOSED CBS 2913.49 Part 2 Tuesday CBD/RIVERFRONT 6.0 8.0 39.1067 -84.5097 2XX E 9TH ST Part 2 26F Downtown Downtown
1 2024-INC-000477 6/8/2024 12:22 4/20/2024 0:00 4/21/2024 0:00 EARLY_CLOSED D3B2 2913.49 Part 2 Saturday EAST PRICE HILL 0.0 0.0 39.1047 -84.5696 7XX FAIRBANKS AVE Part 2 26F East Price Hill East Price Hill
2 2024-INC-001607 6/21/2024 14:42 6/20/2024 14:08 6/20/2024 14:25 EARLY_CLOSED D3B4 2913.49 Part 2 Friday MILLVALE 14.0 14.0 39.1488 -84.5523 19XX MILLVALE CT Part 2 26F Millvale Millvale
3 2024-INC-001801 6/23/2024 14:47 5/28/2024 0:00 6/4/2024 23:59 EARLY_CLOSED D3B4 2913.49 Part 2 Sunday ENGLISH WOODS 0.0 23.0 39.1414 -84.5585 19XX SUTTER AVE Part 2 26F English Woods_North Fairmount English Woods
4 2024-INC-002557 7/2/2024 23:51 7/2/2024 13:20 7/2/2024 23:51 EARLY_CLOSED D4B2 2913.49 Part 2 Tuesday SPRING GROVE VILLAGE 13.0 23.0 39.1718 -84.5175 7XX MCMAKIN AVE Part 2 26F Spring Grove Village Spring Grove Village
In [4]:
df_clean = df.dropna().drop_duplicates()
print("Rows before:", len(df))
print("Rows after cleaning:", len(df_clean))
Rows before: 24186
Rows after cleaning: 19134

Clustering Crime Hotspots

In [13]:
# Extract coordinates
coords = df_clean[['LATITUDE_X', 'LONGITUDE_X']].copy()

# Standardize the data
scaler = StandardScaler()
coords_scaled = scaler.fit_transform(coords)
In [15]:
# Fit HDBSCAN (min_cluster_size: smallest hotspot we care about;
# min_samples: larger values make the density estimate more conservative, yielding more noise)
clusterer = hdbscan.HDBSCAN(min_cluster_size=50, min_samples=10)
cluster_labels = clusterer.fit_predict(coords_scaled)

# Attach labels back to the dataframe
df_clean['cluster'] = cluster_labels
In [19]:
# Define a color map: one discrete color per cluster, noise stays gray
num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
colormap = plt.get_cmap('Set1', num_clusters)  # cm.get_cmap is deprecated since Matplotlib 3.7
norm = colors.Normalize(vmin=0, vmax=num_clusters)

# Map a cluster label to a hex color string
def get_color(label):
    if label == -1:
        return 'gray'  # HDBSCAN labels noise points as -1
    rgb = colormap(norm(label))[:3]
    return f'#{int(rgb[0]*255):02x}{int(rgb[1]*255):02x}{int(rgb[2]*255):02x}'

# Create the base map centered on Cincinnati
cincinnati_map = folium.Map(location=[39.1031, -84.5120], zoom_start=12)
incidents = plugins.MarkerCluster().add_to(cincinnati_map)

# Add color-coded markers
for lat, lng, label in zip(df_clean['LATITUDE_X'], df_clean['LONGITUDE_X'], df_clean['cluster']):
    folium.CircleMarker(
        location=[lat, lng],
        radius=3,
        color=get_color(label),
        fill=True,
        fill_opacity=0.6,
        popup=f'Cluster: {label}'
    ).add_to(incidents)

cincinnati_map
Out[19]:
[Interactive Folium map: crime incidents color-coded by HDBSCAN cluster, noise points in gray]

Crime Pattern Segmentation

In [54]:
df_seg = df_clean[['Hour_From', 'Hour_To', 'DAY_OF_WEEK', 'BEAT',
                   'STARS_CATEGORY', 'CPD_Neighborhood']].dropna()

# Exclude rows carrying only the generic 'Part 2' label, keeping specific offense categories
df_seg = df_seg[df_seg['STARS_CATEGORY'] != 'Part 2']

# Define categorical and numeric columns
categorical_cols = ['DAY_OF_WEEK', 'BEAT', 'STARS_CATEGORY', 'CPD_Neighborhood']
numerical_cols = ['Hour_From', 'Hour_To']

# One-hot encode categorical columns
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cats = encoder.fit_transform(df_seg[categorical_cols])

# Combine with numerical features
X_seg = np.hstack([df_seg[numerical_cols].values, encoded_cats])

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_seg)
In [56]:
kmeans = KMeans(n_clusters=6, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

# Attach labels
df_seg['cluster'] = cluster_labels
In [64]:
cluster_summary = df_seg.groupby('cluster')[['Hour_From']].mean().round(2)
cluster_counts = df_seg['cluster'].value_counts()
cluster_modes = df_seg.groupby('cluster')[['STARS_CATEGORY', 'DAY_OF_WEEK', 'CPD_Neighborhood']].agg(lambda x: x.mode()[0])

print("Cluster Summary (Averages):\n", cluster_summary)
print("\nCluster Sizes:\n", cluster_counts)
print("\nMost Common Category, Day, Neighborhood:\n", cluster_modes)
Cluster Summary (Averages):
          Hour_From
cluster           
0            12.98
1            12.10
2            12.96
3            12.05
4            11.96
5            12.62

Cluster Sizes:
 cluster
3    4733
0    3828
1    1121
4     669
2     549
5     185
Name: count, dtype: int64

Most Common Category, Day, Neighborhood:
                STARS_CATEGORY DAY_OF_WEEK CPD_Neighborhood
cluster                                                   
0        Personal/Other Theft      Monday         WESTWOOD
1        Personal/Other Theft      Monday  WEST PRICE HILL
2        Personal/Other Theft     Tuesday              CUF
3        Personal/Other Theft      Monday   CBD/RIVERFRONT
4             Theft from Auto      Monday   OVER-THE-RHINE
5                  Auto Theft      Monday     WINTON HILLS

Crime Type Classification

In [67]:
# Select relevant columns and drop nulls
df_class = df_clean[['STARS_CATEGORY', 'Hour_From', 'DAY_OF_WEEK', 'SECTION',
                     'BEAT', 'CPD_Neighborhood', 'CLSD']].dropna()
In [69]:
cat_cols = ['DAY_OF_WEEK', 'BEAT', 'CPD_Neighborhood', 'CLSD']
num_cols = ['Hour_From', 'SECTION']  # note: SECTION is an offense-code number, treated here as numeric

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cat = encoder.fit_transform(df_class[cat_cols])

X = np.hstack([df_class[num_cols].values, encoded_cat])
In [91]:
y = (df_class['STARS_CATEGORY'] == 'Part 1').astype(int)  # 1 = 'Part 1', 0 = everything else
In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [85]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
Out[85]:
RandomForestClassifier(random_state=42)
In [87]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['Part 2', 'Part 1']))
Accuracy: 0.8690880585314869

Confusion Matrix:
 [[3184  191]
 [ 310  142]]

Classification Report:
               precision    recall  f1-score   support

      Part 2       0.91      0.94      0.93      3375
      Part 1       0.43      0.31      0.36       452

    accuracy                           0.87      3827
   macro avg       0.67      0.63      0.64      3827
weighted avg       0.85      0.87      0.86      3827
