In [1]:
!pip install folium
!pip install hdbscan
!pip install scikit-learn
Requirement already satisfied: folium in c:\users\anhbv\anaconda3\lib\site-packages (0.19.6) Requirement already satisfied: branca>=0.6.0 in c:\users\anhbv\anaconda3\lib\site-packages (from folium) (0.8.1) Requirement already satisfied: jinja2>=2.9 in c:\users\anhbv\anaconda3\lib\site-packages (from folium) (3.1.4) Requirement already satisfied: numpy in c:\users\anhbv\anaconda3\lib\site-packages (from folium) (1.26.4) Requirement already satisfied: requests in c:\users\anhbv\anaconda3\lib\site-packages (from folium) (2.32.3) Requirement already satisfied: xyzservices in c:\users\anhbv\anaconda3\lib\site-packages (from folium) (2022.9.0) Requirement already satisfied: MarkupSafe>=2.0 in c:\users\anhbv\anaconda3\lib\site-packages (from jinja2>=2.9->folium) (2.1.3) Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\anhbv\anaconda3\lib\site-packages (from requests->folium) (3.3.2) Requirement already satisfied: idna<4,>=2.5 in c:\users\anhbv\anaconda3\lib\site-packages (from requests->folium) (3.7) Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\anhbv\anaconda3\lib\site-packages (from requests->folium) (2.2.3) Requirement already satisfied: certifi>=2017.4.17 in c:\users\anhbv\anaconda3\lib\site-packages (from requests->folium) (2025.1.31) Collecting hdbscan Downloading hdbscan-0.8.40-cp312-cp312-win_amd64.whl.metadata (15 kB) Requirement already satisfied: numpy<3,>=1.20 in c:\users\anhbv\anaconda3\lib\site-packages (from hdbscan) (1.26.4) Requirement already satisfied: scipy>=1.0 in c:\users\anhbv\anaconda3\lib\site-packages (from hdbscan) (1.13.1) Requirement already satisfied: scikit-learn>=0.20 in c:\users\anhbv\anaconda3\lib\site-packages (from hdbscan) (1.5.1) Requirement already satisfied: joblib>=1.0 in c:\users\anhbv\anaconda3\lib\site-packages (from hdbscan) (1.4.2) Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\anhbv\anaconda3\lib\site-packages (from scikit-learn>=0.20->hdbscan) (3.5.0) Downloading hdbscan-0.8.40-cp312-cp312-win_amd64.whl (726 kB) ---------------------------------------- 0.0/726.2 kB ? eta -:--:-- ---------------------------------------- 726.2/726.2 kB 9.9 MB/s eta 0:00:00 Installing collected packages: hdbscan Successfully installed hdbscan-0.8.40 Requirement already satisfied: scikit-learn in c:\users\anhbv\anaconda3\lib\site-packages (1.5.1) Requirement already satisfied: numpy>=1.19.5 in c:\users\anhbv\anaconda3\lib\site-packages (from scikit-learn) (1.26.4) Requirement already satisfied: scipy>=1.6.0 in c:\users\anhbv\anaconda3\lib\site-packages (from scikit-learn) (1.13.1) Requirement already satisfied: joblib>=1.2.0 in c:\users\anhbv\anaconda3\lib\site-packages (from scikit-learn) (1.4.2) Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\anhbv\anaconda3\lib\site-packages (from scikit-learn) (3.5.0)
In [81]:
import pandas as pd
import folium
import hdbscan
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
In [83]:
file_path = r'C:\Users\anhbv\Downloads\Reported_Crime__STARS_Category_Offenses__20250524.csv'
# Load the dataset
df = pd.read_csv(file_path) # skips problematic rows
# Preview the data
df.head()
Out[83]:
Incident_No | DateReported | DateFrom | DateTo | CLSD | BEAT | SECTION | STARS_CATEGORY | DAY_OF_WEEK | CPD_Neighborhood | Hour_From | Hour_To | LATITUDE_X | LONGITUDE_X | Address_X | type | NIBRSUCRCODE | SNA_Neighborhood | CC_Neighborhood | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2024-INC-000149 | 6/4/2024 14:33 | 5/3/2024 6:00 | 5/3/2024 8:00 | EARLY_CLOSED | CBS | 2913.49 | Part 2 | Tuesday | CBD/RIVERFRONT | 6.0 | 8.0 | 39.1067 | -84.5097 | 2XX E 9TH ST | Part 2 | 26F | Downtown | Downtown |
1 | 2024-INC-000477 | 6/8/2024 12:22 | 4/20/2024 0:00 | 4/21/2024 0:00 | EARLY_CLOSED | D3B2 | 2913.49 | Part 2 | Saturday | EAST PRICE HILL | 0.0 | 0.0 | 39.1047 | -84.5696 | 7XX FAIRBANKS AVE | Part 2 | 26F | East Price Hill | East Price Hill |
2 | 2024-INC-001607 | 6/21/2024 14:42 | 6/20/2024 14:08 | 6/20/2024 14:25 | EARLY_CLOSED | D3B4 | 2913.49 | Part 2 | Friday | MILLVALE | 14.0 | 14.0 | 39.1488 | -84.5523 | 19XX MILLVALE CT | Part 2 | 26F | Millvale | Millvale |
3 | 2024-INC-001801 | 6/23/2024 14:47 | 5/28/2024 0:00 | 6/4/2024 23:59 | EARLY_CLOSED | D3B4 | 2913.49 | Part 2 | Sunday | ENGLISH WOODS | 0.0 | 23.0 | 39.1414 | -84.5585 | 19XX SUTTER AVE | Part 2 | 26F | English Woods_North Fairmount | English Woods |
4 | 2024-INC-002557 | 7/2/2024 23:51 | 7/2/2024 13:20 | 7/2/2024 23:51 | EARLY_CLOSED | D4B2 | 2913.49 | Part 2 | Tuesday | SPRING GROVE VILLAGE | 13.0 | 23.0 | 39.1718 | -84.5175 | 7XX MCMAKIN AVE | Part 2 | 26F | Spring Grove Village | Spring Grove Village |
In [4]:
df_clean = df.dropna().drop_duplicates()
print("Rows before:", len(df))
print("Rows after cleaning:", len(df_clean))
Rows before: 24186 Rows after cleaning: 19134
Clustering Crime Hotspots¶
In [13]:
# Extract coordinates
coords = df_clean[['LATITUDE_X', 'LONGITUDE_X']].copy()
# Standardize the data
scaler = StandardScaler()
coords_scaled = scaler.fit_transform(coords)
In [15]:
# Fit HDBSCAN
clusterer = hdbscan.HDBSCAN(min_cluster_size=50, min_samples=10)
cluster_labels = clusterer.fit_predict(coords_scaled)
# Attach labels back to the dataframe
df_clean['cluster'] = cluster_labels
In [19]:
# Define a color map
num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
colormap = cm.get_cmap('Set1', num_clusters)
norm = colors.Normalize(vmin=0, vmax=num_clusters)
# Function to assign color
def get_color(label):
if label == -1:
return 'gray'
rgb = colormap(norm(label))[:3]
return f'#{int(rgb[0]*255):02x}{int(rgb[1]*255):02x}{int(rgb[2]*255):02x}'
# Create the map again
cincinnati_map = folium.Map(location=[39.1031, -84.5120], zoom_start=12)
incidents = plugins.MarkerCluster().add_to(cincinnati_map)
# Add color-coded markers
for lat, lng, label in zip(df_clean['LATITUDE_X'], df_clean['LONGITUDE_X'], df_clean['cluster']):
folium.CircleMarker(
location=[lat, lng],
radius=3,
color=get_color(label),
fill=True,
fill_opacity=0.6,
popup=f'Cluster: {label}'
).add_to(incidents)
cincinnati_map
C:\Users\anhbv\AppData\Local\Temp\ipykernel_3892\1453171595.py:3: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed in 3.11. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap()`` or ``pyplot.get_cmap()`` instead. colormap = cm.get_cmap('Set1', num_clusters)
Out[19]:
Make this Notebook Trusted to load map: File -> Trust Notebook