In [1]:
!pip install folium
!pip install hdbscan
!pip install scikit-learn
Requirement already satisfied: folium in c:\users\anhbv\anaconda3\lib\site-packages (0.19.6)
Collecting hdbscan
Successfully installed hdbscan-0.8.40
Requirement already satisfied: scikit-learn in c:\users\anhbv\anaconda3\lib\site-packages (1.5.1)
In [81]:
import numpy as np
import pandas as pd
import folium
from folium import plugins
import hdbscan
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
In [83]:
file_path = r'C:\Users\anhbv\Downloads\Reported_Crime__STARS_Category_Offenses__20250524.csv'
# Load the dataset
df = pd.read_csv(file_path)
# Preview the data
df.head()
Out[83]:
Incident_No DateReported DateFrom DateTo CLSD BEAT SECTION STARS_CATEGORY DAY_OF_WEEK CPD_Neighborhood Hour_From Hour_To LATITUDE_X LONGITUDE_X Address_X type NIBRSUCRCODE SNA_Neighborhood CC_Neighborhood
0 2024-INC-000149 6/4/2024 14:33 5/3/2024 6:00 5/3/2024 8:00 EARLY_CLOSED CBS 2913.49 Part 2 Tuesday CBD/RIVERFRONT 6.0 8.0 39.1067 -84.5097 2XX E 9TH ST Part 2 26F Downtown Downtown
1 2024-INC-000477 6/8/2024 12:22 4/20/2024 0:00 4/21/2024 0:00 EARLY_CLOSED D3B2 2913.49 Part 2 Saturday EAST PRICE HILL 0.0 0.0 39.1047 -84.5696 7XX FAIRBANKS AVE Part 2 26F East Price Hill East Price Hill
2 2024-INC-001607 6/21/2024 14:42 6/20/2024 14:08 6/20/2024 14:25 EARLY_CLOSED D3B4 2913.49 Part 2 Friday MILLVALE 14.0 14.0 39.1488 -84.5523 19XX MILLVALE CT Part 2 26F Millvale Millvale
3 2024-INC-001801 6/23/2024 14:47 5/28/2024 0:00 6/4/2024 23:59 EARLY_CLOSED D3B4 2913.49 Part 2 Sunday ENGLISH WOODS 0.0 23.0 39.1414 -84.5585 19XX SUTTER AVE Part 2 26F English Woods_North Fairmount English Woods
4 2024-INC-002557 7/2/2024 23:51 7/2/2024 13:20 7/2/2024 23:51 EARLY_CLOSED D4B2 2913.49 Part 2 Tuesday SPRING GROVE VILLAGE 13.0 23.0 39.1718 -84.5175 7XX MCMAKIN AVE Part 2 26F Spring Grove Village Spring Grove Village
In [4]:
df_clean = df.dropna().drop_duplicates()
print("Rows before:", len(df))
print("Rows after cleaning:", len(df_clean))
Rows before: 24186
Rows after cleaning: 19134

Clustering Crime Hotspots

In [13]:
# Extract coordinates
coords = df_clean[['LATITUDE_X', 'LONGITUDE_X']].copy()

# Standardize the data
scaler = StandardScaler()
coords_scaled = scaler.fit_transform(coords)
In [15]:
# Fit HDBSCAN (min_cluster_size: smallest hotspot we care about;
# min_samples: larger values make the density estimate more conservative, yielding more noise)
clusterer = hdbscan.HDBSCAN(min_cluster_size=50, min_samples=10)
cluster_labels = clusterer.fit_predict(coords_scaled)

# Attach labels back to the dataframe
df_clean['cluster'] = cluster_labels
In [19]:
# Define a color map: one discrete color per cluster, noise stays gray
num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
colormap = plt.get_cmap('Set1', num_clusters)  # cm.get_cmap is deprecated since Matplotlib 3.7
norm = colors.Normalize(vmin=0, vmax=num_clusters)

# Map a cluster label to a hex color string
def get_color(label):
    if label == -1:
        return 'gray'  # HDBSCAN labels noise points as -1
    rgb = colormap(norm(label))[:3]
    return f'#{int(rgb[0]*255):02x}{int(rgb[1]*255):02x}{int(rgb[2]*255):02x}'

# Create the base map centered on Cincinnati
cincinnati_map = folium.Map(location=[39.1031, -84.5120], zoom_start=12)
incidents = plugins.MarkerCluster().add_to(cincinnati_map)

# Add color-coded markers
for lat, lng, label in zip(df_clean['LATITUDE_X'], df_clean['LONGITUDE_X'], df_clean['cluster']):
    folium.CircleMarker(
        location=[lat, lng],
        radius=3,
        color=get_color(label),
        fill=True,
        fill_opacity=0.6,
        popup=f'Cluster: {label}'
    ).add_to(incidents)

cincinnati_map
Out[19]:
[Interactive Folium map: crime incidents color-coded by HDBSCAN cluster, noise points in gray]

Crime Pattern Segmentation

In [54]:
df_seg = df_clean[['Hour_From', 'Hour_To', 'DAY_OF_WEEK', 'BEAT',
                   'STARS_CATEGORY', 'CPD_Neighborhood']].dropna()

# Exclude rows carrying only the generic 'Part 2' label, keeping specific offense categories
df_seg = df_seg[df_seg['STARS_CATEGORY'] != 'Part 2']

# Define categorical and numeric columns
categorical_cols = ['DAY_OF_WEEK', 'BEAT', 'STARS_CATEGORY', 'CPD_Neighborhood']
numerical_cols = ['Hour_From', 'Hour_To']

# One-hot encode categorical columns
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cats = encoder.fit_transform(df_seg[categorical_cols])

# Combine with numerical features
X_seg = np.hstack([df_seg[numerical_cols].values, encoded_cats])

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_seg)
In [56]:
kmeans = KMeans(n_clusters=6, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

# Attach labels
df_seg['cluster'] = cluster_labels
In [64]:
cluster_summary = df_seg.groupby('cluster')[['Hour_From']].mean().round(2)
cluster_counts = df_seg['cluster'].value_counts()
cluster_modes = df_seg.groupby('cluster')[['STARS_CATEGORY', 'DAY_OF_WEEK', 'CPD_Neighborhood']].agg(lambda x: x.mode()[0])

print("Cluster Summary (Averages):\n", cluster_summary)
print("\nCluster Sizes:\n", cluster_counts)
print("\nMost Common Category, Day, Neighborhood:\n", cluster_modes)
Cluster Summary (Averages):
          Hour_From
cluster           
0            12.98
1            12.10
2            12.96
3            12.05
4            11.96
5            12.62

Cluster Sizes:
 cluster
3    4733
0    3828
1    1121
4     669
2     549
5     185
Name: count, dtype: int64

Most Common Category, Day, Neighborhood:
                STARS_CATEGORY DAY_OF_WEEK CPD_Neighborhood
cluster                                                   
0        Personal/Other Theft      Monday         WESTWOOD
1        Personal/Other Theft      Monday  WEST PRICE HILL
2        Personal/Other Theft     Tuesday              CUF
3        Personal/Other Theft      Monday   CBD/RIVERFRONT
4             Theft from Auto      Monday   OVER-THE-RHINE
5                  Auto Theft      Monday     WINTON HILLS

Crime Type Classification

In [67]:
# Select relevant columns and drop nulls
df_class = df_clean[['STARS_CATEGORY', 'Hour_From', 'DAY_OF_WEEK', 'SECTION',
                     'BEAT', 'CPD_Neighborhood', 'CLSD']].dropna()
In [69]:
cat_cols = ['DAY_OF_WEEK', 'BEAT', 'CPD_Neighborhood', 'CLSD']
num_cols = ['Hour_From', 'SECTION']  # note: SECTION is an offense-code number, treated here as numeric

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cat = encoder.fit_transform(df_class[cat_cols])

X = np.hstack([df_class[num_cols].values, encoded_cat])
In [91]:
y = (df_class['STARS_CATEGORY'] == 'Part 1').astype(int)  # 1 = 'Part 1', 0 = everything else
In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [85]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
Out[85]:
RandomForestClassifier(random_state=42)
In [87]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['Part 2', 'Part 1']))
Accuracy: 0.8690880585314869

Confusion Matrix:
 [[3184  191]
 [ 310  142]]

Classification Report:
               precision    recall  f1-score   support

      Part 2       0.91      0.94      0.93      3375
      Part 1       0.43      0.31      0.36       452

    accuracy                           0.87      3827
   macro avg       0.67      0.63      0.64      3827
weighted avg       0.85      0.87      0.86      3827
