In [67]:
# Essential import statements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Suppress scientific notation in the display of NumPy arrays
np.set_printoptions(suppress=True)
In [68]:
# Import the Census API wrapper package
from census import Census
import time
In [ ]:
# First we are going to call the Census API and pull the variables we want
In [36]:
# pulling ACS data
API_KEY = "KEY"  # replace "KEY" with your own Census API key
c = Census(API_KEY, year=2023)
variables = [
"NAME",
# Education Variables
"B15003_001E", "B15003_017E", "B15003_018E", "B15003_019E", "B15003_020E",
"B15003_021E", "B15003_022E", "B15003_023E", "B15003_024E", "B14001_001E",
"B14001_002E", "B14001_008E", "B14002_001E", "B14002_003E", "B14002_004E",
"B14007_001E", "B14007_003E",
# Income Variables
"B19013_001E", "B19301_001E", "B17021_001E", "B17021_002E",
# Health Variables
"B18135_001E", "B18135_007E", "B27001_001E", "B27001_005E", "B27001_033E",
# Immigration Variables
"B05002_001E", "B05002_013E", "B05006_001E", "B05006_005E",
# NEW: Population and Housing
"B02001_001E", "B25003_001E", "B25003_002E", "B25003_003E",
]
# Function to split the variable list into chunks of 40 (to avoid API limits)
def chunk_list(lst, size=40):
    for i in range(0, len(lst), size):
        yield lst[i:i + size]

# Safe request function: retry a few times before giving up
def safe_census_request(census_obj, fields, geo, retries=3):
    for attempt in range(retries):
        try:
            return census_obj.acs5.get(fields, geo)
        except Exception as e:
            print(f"API request failed (Attempt {attempt+1}/{retries}): {e}")
            time.sleep(5)
    return None

# Pull data for all 5 boroughs
nyc_counties = ["005", "047", "061", "081", "085"]
dfs = []
for county_code in nyc_counties:
    print(f"🔄 Pulling data for county: {county_code}...")
    county_dfs = []
    for chunk in chunk_list(variables, size=40):
        geo_filter = {"for": "tract:*", "in": f"state:36 county:{county_code}"}
        data = safe_census_request(c, chunk, geo_filter)
        if data:
            df_chunk = pd.DataFrame(data)
            county_dfs.append(df_chunk)
        else:
            print(f"⚠️ Skipping chunk for county {county_code} due to API failure.")
    if county_dfs:
        county_df = pd.concat(county_dfs, axis=1)
        county_df = county_df.loc[:, ~county_df.columns.duplicated()]
        dfs.append(county_df)

# ✅ Combine all counties into one big DataFrame
if dfs:
    merged_df = pd.concat(dfs, axis=0)
    merged_df.to_csv("census_data_ny_2023.csv", index=False)
    print("✅ Full NYC data saved as 'census_data_ny_2023.csv'")
else:
    print("🚫 No data retrieved. Check API errors.")
🔄 Pulling data for county: 005...
🔄 Pulling data for county: 047...
🔄 Pulling data for county: 061...
🔄 Pulling data for county: 081...
🔄 Pulling data for county: 085...
✅ Full NYC data saved as 'census_data_ny_2023.csv'
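In [ ]:
# A hedged aside: joins against other tract-level data often use the full
# 11-digit GEOID (2-digit state + 3-digit county + 6-digit tract). A minimal
# sketch, assuming the merged_df built above; the GEOID name is our own choice.
merged_df["GEOID"] = (
    merged_df["state"].astype(str).str.zfill(2)
    + merged_df["county"].astype(str).str.zfill(3)
    + merged_df["tract"].astype(str).str.zfill(6)
)
print(merged_df["GEOID"].head())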
In [ ]:
# In the section below, we clean the data and prepare it for a join with the PLUTO data
In [69]:
# Read the CSV file
census_df = pd.read_csv("census_data_ny_2023.csv")
# Print the DataFrame
print(census_df)
                                                 NAME  B15003_001E  \
0              Census Tract 1; Bronx County; New York       3117.0
1              Census Tract 2; Bronx County; New York       3735.0
2              Census Tract 4; Bronx County; New York       4557.0
3             Census Tract 16; Bronx County; New York       3858.0
4          Census Tract 19.01; Bronx County; New York       1710.0
...                                               ...          ...
2322  Census Tract 303.02; Richmond County; New York       4644.0
2323  Census Tract 319.01; Richmond County; New York       2288.0
2324  Census Tract 319.02; Richmond County; New York       2976.0
2325     Census Tract 323; Richmond County; New York        774.0
2326    Census Tract 9901; Richmond County; New York          0.0

      B15003_017E  B15003_018E  B15003_019E  B15003_020E  B15003_021E  \
0          1022.0        501.0         80.0        209.0         60.0
1           886.0        320.0         26.0        466.0        298.0
2           887.0         74.0        212.0        490.0        390.0
3           967.0        105.0        137.0        361.0        448.0
4           352.0         10.0         35.0        223.0        233.0
...           ...          ...          ...          ...          ...
2322        887.0        304.0        206.0        542.0        632.0
2323        369.0        240.0        112.0        165.0        188.0
2324        640.0        214.0        167.0        314.0        140.0
2325        202.0         30.0         15.0        103.0         37.0
2326          0.0          0.0          0.0          0.0          0.0

      B15003_022E  B15003_023E  B15003_024E  ...  B05002_013E  B05006_001E  \
0            18.0          7.0          9.0  ...        396.0        396.0
1           574.0        649.0          0.0  ...       1989.0       1989.0
2          1118.0        391.0        184.0  ...       1027.0       1027.0
3           492.0        213.0          0.0  ...       1810.0       1810.0
4           302.0        206.0         17.0  ...        595.0        595.0
...           ...          ...          ...  ...          ...          ...
2322       1169.0        297.0         62.0  ...       2128.0       2128.0
2323        168.0        185.0          3.0  ...        522.0        522.0
2324        708.0        234.0          0.0  ...       1122.0       1122.0
2325        109.0        147.0          0.0  ...        211.0        211.0
2326          0.0          0.0          0.0  ...          0.0          0.0

      B05006_005E  B02001_001E  B25003_001E  B25003_002E  B25003_003E  state  \
0             0.0       3538.0          0.0          0.0          0.0     36
1             0.0       5177.0       1446.0        864.0        582.0     36
2             0.0       6481.0       2246.0       1376.0        870.0     36
3             8.0       6011.0       2149.0        496.0       1653.0     36
4             0.0       2401.0       1092.0          0.0       1092.0     36
...           ...          ...          ...          ...          ...    ...
2322          0.0       6502.0       2140.0       1707.0        433.0     36
2323          0.0       3888.0       1237.0        271.0        966.0     36
2324          0.0       4502.0       1588.0        833.0        755.0     36
2325          0.0       1078.0        453.0        320.0        133.0     36
2326          0.0          0.0          0.0          0.0          0.0     36

      county   tract
0          5     100
1          5     200
2          5     400
3          5    1600
4          5    1901
...      ...     ...
2322      85   30302
2323      85   31901
2324      85   31902
2325      85   32300
2326      85  990100

[2327 rows x 38 columns]
In [70]:
# Note: `data` still holds the last API response from the pull loop above
# (the final Richmond County chunk), so this preview covers Staten Island only
data_df = pd.DataFrame(data)
print(data_df.sample(5))
                                               NAME  B15003_001E  B15003_017E  \
80  Census Tract 177.02; Richmond County; New York       1844.0        546.0
45  Census Tract 128.04; Richmond County; New York       3453.0        770.0
96     Census Tract 213; Richmond County; New York       3620.0        834.0
53     Census Tract 134; Richmond County; New York       2549.0        629.0
6       Census Tract 17; Richmond County; New York       1139.0        176.0

    B15003_018E  B15003_019E  B15003_020E  B15003_021E  B15003_022E  \
80         48.0        266.0        181.0        165.0        404.0
45        189.0        317.0        371.0        248.0        617.0
96        501.0         94.0        266.0        240.0        542.0
53         16.0        116.0        272.0        203.0        654.0
6          60.0         47.0        116.0        120.0        273.0

    B15003_023E  B15003_024E  ...  B05002_013E  B05006_001E  B05006_005E  \
80        177.0          9.0  ...        385.0        385.0          0.0
45        228.0         40.0  ...       1936.0       1936.0          0.0
96        521.0         87.0  ...       1167.0       1167.0          0.0
53        284.0         82.0  ...        588.0        588.0          0.0
6         123.0         84.0  ...        300.0        300.0          0.0

    B02001_001E  B25003_001E  B25003_002E  B25003_003E  state  county   tract
80       2338.0        942.0        738.0        204.0     36     085  017702
45       4643.0       1490.0       1061.0        429.0     36     085  012804
96       5391.0       1540.0        932.0        608.0     36     085  021300
53       3660.0       1370.0        934.0        436.0     36     085  013400
6        1518.0        612.0        296.0        316.0     36     085  001700

[5 rows x 38 columns]
In [71]:
print(data_df.columns)
Index(['NAME', 'B15003_001E', 'B15003_017E', 'B15003_018E', 'B15003_019E',
       'B15003_020E', 'B15003_021E', 'B15003_022E', 'B15003_023E',
       'B15003_024E', 'B14001_001E', 'B14001_002E', 'B14001_008E',
       'B14002_001E', 'B14002_003E', 'B14002_004E', 'B14007_001E',
       'B14007_003E', 'B19013_001E', 'B19301_001E', 'B17021_001E',
       'B17021_002E', 'B18135_001E', 'B18135_007E', 'B27001_001E',
       'B27001_005E', 'B27001_033E', 'B05002_001E', 'B05002_013E',
       'B05006_001E', 'B05006_005E', 'B02001_001E', 'B25003_001E',
       'B25003_002E', 'B25003_003E', 'state', 'county', 'tract'],
      dtype='object')
In [72]:
import pandas as pd
# Read in the dataset
merged_df = pd.read_csv("census_data_ny_2023.csv")
# Rename columns for readability
rename_dict = {
"B02001_001E": "population", # Total population
"B19013_001E": "medhhinc", # Median household income
"B19301_001E": "percapita_income", # Per capita income
"B17021_001E": "poverty_total", # Total population for poverty status
"B17021_002E": "poverty_below", # Population below poverty level
"B25003_001E": "housing_total", # Total housing units
"B25003_002E": "housing_owner", # Owner-occupied housing
"B25003_003E": "housing_renter", # Renter-occupied housing
"B05002_001E": "nativity_total", # Total population for nativity
"B05002_013E": "nativity_foreign", # Foreign-born population
"B15003_017E": "hs_grad", # High school graduate
"B15003_021E": "bachelors_deg", # Bachelor's degree
"B15003_022E": "masters_deg", # Master's degree
"B15003_023E": "prof_school_deg", # Professional school degree
"B15003_024E": "doctorate_deg", # Doctorate degree
"B18135_001E": "disability_total", # Total population with a disability
"B18135_007E": "disability_under18", # Under 18 with a disability
"B27001_001E": "health_ins_total", # Total population for health insurance
"B27001_005E": "health_ins_m18_24", # Males 18–24 with health insurance
"B27001_033E": "health_ins_f18_24", # Females 18–24 with health insurance
}
# Apply renaming
merged_df.rename(columns=rename_dict, inplace=True)
# Keep only the renamed columns + tract identifiers
keep_columns = list(rename_dict.values()) + ["NAME", "state", "county", "tract"]
merged_df = merged_df[keep_columns]
# Track the size of the dataset before drops
print("Before cleaning:", len(merged_df))
# Drop rows where median household income is missing or negative
data_df = merged_df.loc[merged_df["medhhinc"] >= 0]
# Drop any remaining rows with missing values
data_df = data_df.dropna()
# Track size after cleaning
print("After cleaning:", len(data_df))
# Calculate derived columns
data_df["povpctind"] = (data_df["poverty_below"] / data_df["poverty_total"]) * 100
data_df["tenpctown"] = (data_df["housing_owner"] / data_df["housing_total"]) * 100
data_df["tenpctren"] = (data_df["housing_renter"] / data_df["housing_total"]) * 100
data_df["natpctfor"] = (data_df["nativity_foreign"] / data_df["nativity_total"]) * 100
# Save cleaned and processed dataset
data_df.to_csv("mycensusdownload.csv", index=False)
print("✅ Cleaned dataset saved as 'mycensusdownload.csv'")
Before cleaning: 2327
After cleaning: 2192
✅ Cleaned dataset saved as 'mycensusdownload.csv'
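In [ ]:
# Sanity check on the derived shares (a sketch): the owner and renter
# percentages should sum to roughly 100 wherever housing_total > 0.
tenure_sum = data_df.loc[data_df["housing_total"] > 0, ["tenpctown", "tenpctren"]].sum(axis=1)
print(tenure_sum.describe())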
In [73]:
# Load and verify the Census data file that we previously created using the API
census_df = pd.read_csv('mycensusdownload.csv')
#############
# What are some ways to preview this dataset?
# Display basic dataset information
print("Dataset Info:")
census_df.info()
# Display the first few rows of the dataset
# (head() is not the last expression in this cell, so its table won't render in the output)
print("\nPreview of the dataset:")
census_df.head()
# Display all column names
print("\nColumn Names:")
print(census_df.columns.tolist())
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2192 entries, 0 to 2191
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   population          2192 non-null   float64
 1   medhhinc            2192 non-null   float64
 2   percapita_income    2192 non-null   float64
 3   poverty_total       2192 non-null   float64
 4   poverty_below       2192 non-null   float64
 5   housing_total       2192 non-null   float64
 6   housing_owner       2192 non-null   float64
 7   housing_renter      2192 non-null   float64
 8   nativity_total      2192 non-null   float64
 9   nativity_foreign    2192 non-null   float64
 10  hs_grad             2192 non-null   float64
 11  bachelors_deg       2192 non-null   float64
 12  masters_deg         2192 non-null   float64
 13  prof_school_deg     2192 non-null   float64
 14  doctorate_deg       2192 non-null   float64
 15  disability_total    2192 non-null   float64
 16  disability_under18  2192 non-null   float64
 17  health_ins_total    2192 non-null   float64
 18  health_ins_m18_24   2192 non-null   float64
 19  health_ins_f18_24   2192 non-null   float64
 20  NAME                2192 non-null   object
 21  state               2192 non-null   int64
 22  county              2192 non-null   int64
 23  tract               2192 non-null   int64
 24  povpctind           2192 non-null   float64
 25  tenpctown           2192 non-null   float64
 26  tenpctren           2192 non-null   float64
 27  natpctfor           2192 non-null   float64
dtypes: float64(24), int64(3), object(1)
memory usage: 479.6+ KB

Preview of the dataset:

Column Names:
['population', 'medhhinc', 'percapita_income', 'poverty_total', 'poverty_below', 'housing_total', 'housing_owner', 'housing_renter', 'nativity_total', 'nativity_foreign', 'hs_grad', 'bachelors_deg', 'masters_deg', 'prof_school_deg', 'doctorate_deg', 'disability_total', 'disability_under18', 'health_ins_total', 'health_ins_m18_24', 'health_ins_f18_24', 'NAME', 'state', 'county', 'tract', 'povpctind', 'tenpctown', 'tenpctren', 'natpctfor']
In [74]:
# Print the 'NAME' column
print(census_df['NAME'])
0               Census Tract 2; Bronx County; New York
1               Census Tract 4; Bronx County; New York
2              Census Tract 16; Bronx County; New York
3           Census Tract 19.01; Bronx County; New York
4           Census Tract 19.02; Bronx County; New York
                             ...
2187    Census Tract 303.01; Richmond County; New York
2188    Census Tract 303.02; Richmond County; New York
2189    Census Tract 319.01; Richmond County; New York
2190    Census Tract 319.02; Richmond County; New York
2191       Census Tract 323; Richmond County; New York
Name: NAME, Length: 2192, dtype: object
In [75]:
#print county column
print(census_df['county'])
0        5
1        5
2        5
3        5
4        5
        ..
2187    85
2188    85
2189    85
2190    85
2191    85
Name: county, Length: 2192, dtype: int64
In [76]:
print(census_df['county'].unique())
[ 5 47 61 81 85]
In [77]:
county_name_map = {
"005": "Bronx County",
"047": "Kings County",
"061": "New York County",
"081": "Queens County",
"085": "Richmond County"
}
# Ensure county codes are strings
merged_df["county"] = merged_df["county"].astype(str).str.zfill(3)
# Map county codes to readable names
merged_df["county_name"] = merged_df["county"].map(county_name_map)
# Preview the new column
print(merged_df[["county", "county_name"]].drop_duplicates())
     county      county_name
0       005     Bronx County
361     047     Kings County
1166    061  New York County
1476    081    Queens County
2201    085  Richmond County
In [78]:
# Ensure the 'NAME' column is a string, filling NaNs with an empty string
# (fillna must run before astype(str), which would otherwise turn NaN into the string 'nan')
census_df['NAME'] = census_df['NAME'].fillna('').astype(str)
# Extract the census tract from the NAME column (before the semicolon)
census_df['tract_name'] = census_df['NAME'].apply(
lambda x: x.split(";")[0].strip() if ";" in x else ""
)
# Display the first few rows to verify
print(census_df[['NAME', 'tract_name']].head())
                                         NAME          tract_name
0      Census Tract 2; Bronx County; New York      Census Tract 2
1      Census Tract 4; Bronx County; New York      Census Tract 4
2     Census Tract 16; Bronx County; New York     Census Tract 16
3  Census Tract 19.01; Bronx County; New York  Census Tract 19.01
4  Census Tract 19.02; Bronx County; New York  Census Tract 19.02
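In [ ]:
# A vectorized alternative to the apply() above (a sketch; it should give
# the same result and is usually faster on large frames):
census_df['tract_name'] = census_df['NAME'].str.split(';').str[0].str.strip()
print(census_df['tract_name'].head())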
In [79]:
# Display unique values in the 'state' column to determine the correct format
print("Unique state values in the dataset:")
print(census_df['state'].unique())
Unique state values in the dataset:
[36]
In [80]:
import re
# Extract numeric portion from 'tract_name' using regex
census_df['tract_number'] = census_df['tract_name'].apply(
lambda x: re.search(r'\d+(\.\d+)?', x).group() if re.search(r'\d+(\.\d+)?', x) else None
)
# Convert to float for sorting/grouping
census_df['tract_number'] = census_df['tract_number'].astype(float)
print(census_df[['tract_name', 'tract_number']].head())
# Extract borough from county_name if it's available
# (we'll add this only if the column doesn't exist yet)
if 'county_name' not in census_df.columns:
    census_df['county_name'] = census_df['NAME'].apply(
        lambda x: x.split(";")[1].strip() if ";" in x else ""
    )
# Map county_name to borough
borough_map = {
"New York County": "Manhattan",
"Kings County": "Brooklyn",
"Bronx County": "Bronx",
"Queens County": "Queens",
"Richmond County": "Staten Island"
}
census_df['borough'] = census_df['county_name'].map(borough_map)
print(census_df[['county_name', 'borough']].drop_duplicates())
           tract_name  tract_number
0      Census Tract 2          2.00
1      Census Tract 4          4.00
2     Census Tract 16         16.00
3  Census Tract 19.01         19.01
4  Census Tract 19.02         19.02

          county_name        borough
0        Bronx County          Bronx
334      Kings County       Brooklyn
1104  New York County      Manhattan
1401    Queens County         Queens
2075  Richmond County  Staten Island
In [81]:
#print unique county names
print("Number of unique counties:", census_df['county_name'].nunique())
Number of unique counties: 5
In [82]:
print("Available columns in data_df:")
print(data_df.columns)
print(merged_df["tract"])
Available columns in data_df:
Index(['population', 'medhhinc', 'percapita_income', 'poverty_total',
       'poverty_below', 'housing_total', 'housing_owner', 'housing_renter',
       'nativity_total', 'nativity_foreign', 'hs_grad', 'bachelors_deg',
       'masters_deg', 'prof_school_deg', 'doctorate_deg', 'disability_total',
       'disability_under18', 'health_ins_total', 'health_ins_m18_24',
       'health_ins_f18_24', 'NAME', 'state', 'county', 'tract', 'povpctind',
       'tenpctown', 'tenpctren', 'natpctfor'],
      dtype='object')
0          100
1          200
2          400
3         1600
4         1901
         ...
2322     30302
2323     31901
2324     31902
2325     32300
2326    990100
Name: tract, Length: 2327, dtype: int64
In [83]:
# Taking a look at the PLUTO data
# Define the file path (adjust if needed)
file_path = "nyc_pluto_24v4_1_csv/pluto_24v4_1.csv"
# Load the PLUTO dataset
pluto_df = pd.read_csv(file_path)
# Display basic info
print("PLUTO Dataset Info:")
pluto_df.info()
# Show the first few rows
pluto_df.head()
/var/folders/7w/1y1zc9c95qnc86tchl3m3yr00000gn/T/ipykernel_9002/2068003549.py:7: DtypeWarning: Columns (21,22,24,26,28,65,66) have mixed types. Specify dtype option on import or set low_memory=False.
  pluto_df = pd.read_csv(file_path)
PLUTO Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858642 entries, 0 to 858641
Data columns (total 92 columns):
 #   Column                Non-Null Count   Dtype
---  ------                --------------   -----
 0   borough               858642 non-null  object
 1   block                 858642 non-null  int64
 2   lot                   858642 non-null  int64
 3   cd                    857534 non-null  float64
 4   bct2020               857449 non-null  float64
 5   bctcb2020             857449 non-null  float64
 6   ct2010                857449 non-null  float64
 7   cb2010                857449 non-null  float64
 8   schooldist            856807 non-null  float64
 9   council               857532 non-null  float64
 10  zipcode               856795 non-null  float64
 11  firecomp              856788 non-null  object
 12  policeprct            856802 non-null  float64
 13  healthcenterdistrict  856802 non-null  float64
 14  healtharea            856802 non-null  float64
 15  sanitboro             856603 non-null  float64
 16  sanitdistrict         856603 non-null  float64
 17  sanitsub              856483 non-null  object
 18  address               858294 non-null  object
 19  zonedist1             856961 non-null  object
 20  zonedist2             19654 non-null   object
 21  zonedist3             214 non-null     object
 22  zonedist4             12 non-null      object
 23  overlay1              74482 non-null   object
 24  overlay2              183 non-null     object
 25  spdist1               105653 non-null  object
 26  spdist2               300 non-null     object
 27  spdist3               0 non-null       float64
 28  ltdheight             3045 non-null    object
 29  splitzone             856961 non-null  object
 30  bldgclass             858297 non-null  object
 31  landuse               855871 non-null  float64
 32  easements             858296 non-null  float64
 33  ownertype             36006 non-null   object
 34  ownername             857773 non-null  object
 35  lotarea               857791 non-null  float64
 36  bldgarea              858256 non-null  float64
 37  comarea               811661 non-null  float64
 38  resarea               811661 non-null  float64
 39  officearea            811661 non-null  float64
 40  retailarea            811661 non-null  float64
 41  garagearea            811661 non-null  float64
 42  strgearea             811661 non-null  float64
 43  factryarea            811661 non-null  float64
 44  otherarea             811661 non-null  float64
 45  areasource            858296 non-null  float64
 46  numbldgs              857793 non-null  float64
 47  numfloors             815803 non-null  float64
 48  unitsres              858146 non-null  float64
 49  unitstotal            858147 non-null  float64
 50  lotfront              857791 non-null  float64
 51  lotdepth              857791 non-null  float64
 52  bldgfront             857791 non-null  float64
 53  bldgdepth             857791 non-null  float64
 54  ext                   795788 non-null  object
 55  proxcode              858296 non-null  float64
 56  irrlotcode            858296 non-null  object
 57  lottype               858296 non-null  float64
 58  bsmtcode              858296 non-null  float64
 59  assessland            858296 non-null  float64
 60  assesstot             858296 non-null  float64
 61  exempttot             858296 non-null  float64
 62  yearbuilt             858296 non-null  float64
 63  yearalter1            858296 non-null  float64
 64  yearalter2            858296 non-null  float64
 65  histdist              32536 non-null   object
 66  landmark              1456 non-null    object
 67  builtfar              857414 non-null  float64
 68  residfar              858642 non-null  float64
 69  commfar               858642 non-null  float64
 70  facilfar              858642 non-null  float64
 71  borocode              858642 non-null  int64
 72  bbl                   858642 non-null  float64
 73  condono               10441 non-null   float64
 74  tract2010             857449 non-null  float64
 75  xcoord                857421 non-null  float64
 76  ycoord                857421 non-null  float64
 77  zonemap               857021 non-null  object
 78  zmcode                15636 non-null   object
 79  sanborn               856927 non-null  object
 80  taxmap                856927 non-null  float64
 81  edesignum             10049 non-null   object
 82  appbbl                96703 non-null   float64
 83  appdate               96703 non-null   object
 84  plutomapid            858642 non-null  int64
 85  firm07_flag           34677 non-null   float64
 86  pfirm15_flag          65750 non-null   float64
 87  version               858642 non-null  object
 88  dcpedited             42097 non-null   object
 89  latitude              857421 non-null  float64
 90  longitude             857421 non-null  float64
 91  notes                 0 non-null       float64
dtypes: float64(60), int64(4), object(28)
memory usage: 602.7+ MB
Out[83]:
 | borough | block | lot | cd | bct2020 | bctcb2020 | ct2010 | cb2010 | schooldist | council | ... | appbbl | appdate | plutomapid | firm07_flag | pfirm15_flag | version | dcpedited | latitude | longitude | notes |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | BK | 7604 | 47 | 314.0 | 3075200.0 | 3.075200e+10 | 752.0 | 1004.0 | 22.0 | 45.0 | ... | NaN | NaN | 1 | NaN | NaN | 24v4.1 | NaN | 40.625779 | -73.953658 | NaN |
1 | BK | 1304 | 1 | 309.0 | 3032300.0 | 3.032300e+10 | 323.0 | 3002.0 | 17.0 | 35.0 | ... | 3.013040e+09 | 10/20/1989 | 1 | NaN | NaN | 24v4.1 | NaN | 40.664671 | -73.956901 | NaN |
2 | BK | 1304 | 58 | 309.0 | 3032300.0 | 3.032300e+10 | 323.0 | 3001.0 | 17.0 | 35.0 | ... | NaN | NaN | 1 | NaN | NaN | 24v4.1 | NaN | 40.665104 | -73.955675 | NaN |
3 | BK | 1294 | 1 | 309.0 | 3032500.0 | 3.032500e+10 | 325.0 | 3000.0 | 17.0 | 35.0 | ... | 3.012940e+09 | 02/27/1989 | 1 | NaN | NaN | 24v4.1 | NaN | 40.666423 | -73.958074 | NaN |
4 | BK | 1302 | 1 | 309.0 | 3032500.0 | 3.032500e+10 | 325.0 | 3001.0 | 17.0 | 35.0 | ... | NaN | NaN | 1 | NaN | NaN | 24v4.1 | NaN | 40.665062 | -73.958101 | NaN |
5 rows × 92 columns
In [84]:
# In this section we look for ways to join the PLUTO and census data
#print bct (from pluto)
print(pluto_df["bct2020"])
#print tract from census
print(merged_df["tract"])
0         3075200.0
1         3032300.0
2         3032300.0
3         3032500.0
4         3032500.0
            ...
858637    5020803.0
858638    2033700.0
858639    3097000.0
858640    2041400.0
858641    3029000.0
Name: bct2020, Length: 858642, dtype: float64
0          100
1          200
2          400
3         1600
4         1901
         ...
2322     30302
2323     31901
2324     31902
2325     32300
2326    990100
Name: tract, Length: 2327, dtype: int64
In [87]:
# Convert both columns to string and sets
pluto_tracts = set(pluto_df["bct2020"].astype(str))
census_tracts = set(merged_df["tract"].astype(str))
In [88]:
# Find intersection
matching_tracts = pluto_tracts.intersection(census_tracts)
# Print results
print(f"✅ Matching tracts: {len(matching_tracts)}")
print("Sample matches:", list(matching_tracts)[:10]) # Show first 10 matches
✅ Matching tracts: 0
Sample matches: []
In [89]:
# It looks like there are no matches because the two columns use different formats:
# bct2020 is a float that prepends a 1-digit borough code to the 6-digit tract code,
# while the census tract is a plain integer.
# As a first attempt, we will strip the trailing '00.0' (including the decimal point)
# from the bct2020 column and try again.
In [90]:
# Step 1: Convert to string
pluto_df["clean_bct"] = pluto_df["bct2020"].astype(str)
# Step 2: Remove a trailing '00.0' (escaping the dot so it matches literally)
pluto_df["clean_bct"] = pluto_df["clean_bct"].str.replace(r"00\.0$", "", regex=True)
# Preview
print(pluto_df[["bct2020", "clean_bct"]].head())
     bct2020 clean_bct
0  3075200.0     30752
1  3032300.0     30323
2  3032300.0     30323
3  3032500.0     30325
4  3032500.0     30325
In [91]:
# Let's rebuild the column, this time zero-padding the result
# Step 1: Convert to string
pluto_df["clean_bct"] = pluto_df["bct2020"].astype(str)
# Step 2: Remove a trailing '00.0'
pluto_df["clean_bct"] = pluto_df["clean_bct"].str.replace(r"00\.0$", "", regex=True)
# Step 3: Zero-pad to 6 digits so the key lines up with the census tract codes
pluto_df["clean_bct"] = pluto_df["clean_bct"].str.zfill(6)
# ✅ Preview cleaned values
print(pluto_df[["bct2020", "clean_bct"]].head())
     bct2020 clean_bct
0  3075200.0    030752
1  3032300.0    030323
2  3032300.0    030323
3  3032500.0    030325
4  3032500.0    030325
In [92]:
# Convert to sets of strings for comparison
pluto_tracts = set(pluto_df["clean_bct"])
census_tracts = set(merged_df["tract"].astype(str).str.zfill(6))
# Find intersection
matching_tracts = pluto_tracts.intersection(census_tracts)
# Print results
print(f"✅ Matching tracts: {len(matching_tracts)}")
print("Sample matches:", list(matching_tracts)[:10])
✅ Matching tracts: 25
Sample matches: ['030900', '050201', '020200', '031200', '020300', '030600', '030301', '040100', '030300', '040101']
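In [ ]:
# A caution, offered as a hedged sketch: bct2020 prepends a 1-digit borough
# code (1 = Manhattan ... 5 = Staten Island) to the 6-digit tract code, so
# matching on stripped digits alone can pair tracts from different boroughs.
# A borough-aware key avoids that; column names here are our own choices.
county_to_boro = {"005": "2", "047": "3", "061": "1", "081": "4", "085": "5"}
pluto_df["boro_tract"] = pluto_df["bct2020"].astype("Int64").astype(str).str.zfill(7)
merged_df["boro_tract"] = (
    merged_df["county"].map(county_to_boro) + merged_df["tract"].astype(str).str.zfill(6)
)
print("Borough-aware matches:", len(set(pluto_df["boro_tract"]) & set(merged_df["boro_tract"])))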
In [93]:
# Merge the data and export it to explore a possible cluster analysis
merged_df["tract_str"] = merged_df["tract"].astype(str).str.zfill(6)
# Perform the inner merge to keep only matching tracts
merged_data = pd.merge(
pluto_df,
merged_df,
left_on="clean_bct",
right_on="tract_str",
how="inner"
)
# ✅ Save to CSV
merged_data.to_csv("merged_pluto_census_matched.csv", index=False)
print(f"✅ Merged dataset saved with {merged_data.shape[0]} matching rows.")
✅ Merged dataset saved with 18917 matching rows.
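In [ ]:
# A quick coverage check on the join keys used above (a sketch): how much
# of each source actually found a partner?
print("PLUTO lots with a matching census tract:",
      pluto_df["clean_bct"].isin(merged_df["tract_str"]).sum())
print("Census rows with a matching PLUTO tract:",
      merged_df["tract_str"].isin(pluto_df["clean_bct"]).sum())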
In [66]:
# Step 1: Group PLUTO data by bct2020 and take average of numeric fields
pluto_tract_df = pluto_df.groupby("bct2020").mean(numeric_only=True).reset_index()
# Step 2: Clean bct2020 for joining
pluto_tract_df["clean_bct"] = (
pluto_tract_df["bct2020"]
.astype(str)
.str.replace(r"00\.0$", "", regex=True)
.str.zfill(6)
)
# Step 3: Prepare census tract column
merged_df["tract_str"] = merged_df["tract"].astype(str).str.zfill(6)
# Step 4: Merge on matching tracts
merged_data = pd.merge(
pluto_tract_df,
merged_df,
left_on="clean_bct",
right_on="tract_str",
how="inner"
)
# Step 5: Export
merged_data.to_csv("merged_pluto_census_by_tract.csv", index=False)
print(f"✅ Exported merged data with shape: {merged_data.shape}")
✅ Exported merged data with shape: (42, 91)
In [96]:
# what we are trying to do with the following cluster analysis:
''' Goal of Clustering
• Which neighborhoods are demographically or socioeconomically similar?
• Are there clusters of high-need areas that share poverty + low education + health disparities?
• Which tracts might be more vulnerable or more affluent?
• Can we identify “types” of tracts — like high-density, low-income, high-disability areas? '''
In [135]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Load your merged dataset
df = pd.read_csv("merged_pluto_census_by_tract.csv")
# -------------------------------
# 🎯 Step 1: Select & Engineer Features
# -------------------------------
df["poverty_rate"] = df["poverty_below"] / df["poverty_total"]
df["renter_rate"] = df["housing_renter"] / df["housing_total"]
features = [
"population",
"housing_total",
"renter_rate",
"medhhinc",
"percapita_income",
"poverty_rate",
"hs_grad",
"bachelors_deg",
"disability_total",
"disability_under18",
"health_ins_total"
]
data = df[features].dropna()
# -------------------------------
# 🔍 Step 2: Standardize Data
# -------------------------------
scaler = StandardScaler()
scaled = scaler.fit_transform(data)
# -------------------------------
# 📊 Step 3: PCA for Visualization
# -------------------------------
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled)
pca_df = pd.DataFrame(pca_result, columns=["PC1", "PC2"])
# -------------------------------
# 🔀 Step 4: KMeans Clustering
# -------------------------------
kmeans = KMeans(n_clusters=4, random_state=42)
clusters = kmeans.fit_predict(scaled)
pca_df["Cluster"] = clusters
# -------------------------------
# 🖼️ Step 5: Plot Clusters
# -------------------------------
plt.figure(figsize=(8, 6))
sns.scatterplot(data=pca_df, x="PC1", y="PC2", hue="Cluster", palette="tab10")
plt.title("Census Tract Clusters (PCA)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(title="Cluster")
plt.grid(True)
plt.tight_layout()
plt.show()
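In [ ]:
# Since the two components are not directly interpretable, it is worth
# checking how much of the original variance they retain (a quick check
# on the PCA fit above):
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total variance retained:", pca.explained_variance_ratio_.sum())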
In [148]:
# Save the cluster assignments to the original df
df["Cluster"] = clusters # Ensure cluster assignments are added to the original dataframe
In [136]:
'''The scatter plot presented is a visual summary of the cluster analysis performed on New York City census tracts.
Each point on the plot represents a single tract, with its position determined through Principal Component Analysis (PCA),
which reduces a set of eleven socio-economic and demographic variables—such as median household income, renter rates, education levels,
and disability counts—into two principal components. These components, while not directly interpretable, capture the greatest variation
across all tracts, allowing for easier visualization of high-dimensional data.
The colors assigned to each point indicate cluster membership as defined by the KMeans clustering algorithm.
In this case, the data has been segmented into four distinct clusters. One cluster, shown in orange (Cluster 1),
appears densely packed in the center of the plot, suggesting a group of census tracts with similar characteristics.
In contrast, other clusters—such as those shown in blue, green, and red—are more sparsely distributed or located at the edges,
indicating that those tracts differ significantly from the central group in terms of their socio-economic profiles.
This PCA plot provides a high-level overview of how census tracts are grouped based on shared attributes. Outliers,
such as the red point located far to the right or the blue points positioned below the central cluster, represent tracts that
are particularly distinct in the dataset. To further interpret the meaning of each cluster, the next step involves analyzing the average
values of the selected variables within each group. This allows for the identification of defining traits—such as high poverty rates,
low educational attainment, or elevated renter populations—and supports more informed mapping and policy analysis across different
neighborhoods in the city.'''
In [137]:
#The table shows the mean values of each variable (like income, poverty, education, etc.) for the tracts in each cluster.
#By comparing these averages across clusters, we can spot patterns and differences that help describe the character of each group.
df.groupby("Cluster")[features].mean()
Out[137]:
Cluster | population | housing_total | renter_rate | medhhinc | percapita_income | poverty_rate | hs_grad | bachelors_deg | disability_total | disability_under18 | health_ins_total
---|---|---|---|---|---|---|---|---|---|---|---
0.0 | 5736.571429 | 2119.428571 | 0.501938 | 80901.714286 | 38254.428571 | 0.128852 | 1088.142857 | 446.142857 | 5724.000000 | 0.0 | 5724.000000
1.0 | 3094.774194 | 1075.806452 | 0.609677 | 84023.645161 | 39811.709677 | 0.146679 | 491.548387 | 147.870968 | 3051.774194 | 0.0 | 3051.774194
2.0 | 1383.666667 | 714.666667 | 0.757479 | 124717.666667 | 115156.333333 | 0.103999 | 77.333333 | 26.333333 | 1382.000000 | 0.0 | 1382.000000
3.0 | 10529.000000 | 3585.000000 | 0.949233 | 44217.000000 | 19785.000000 | 0.386551 | 1041.000000 | 638.000000 | 10529.000000 | 0.0 | 10529.000000
In [138]:
'''The table presents the average socio-economic characteristics of each cluster, offering insight into the distinct profiles of the census tracts grouped by the analysis. Cluster 0 represents middle-income neighborhoods with a balanced mix of renters and homeowners, moderate educational attainment, and a sizable population with disabilities. Cluster 1 includes working-class tracts with moderate income and higher renter rates, slightly elevated poverty levels, and lower levels of educational attainment. Cluster 2 stands out as a high-income, low-density group with the highest median and per capita income, but surprisingly low counts of high school and bachelor’s degree holders, suggesting either small populations or unique demographic compositions. Cluster 3 comprises high-density, low-income areas with the highest renter rates, poverty levels, and disability counts—characteristics often associated with communities experiencing concentrated disadvantage. Together, these clusters reveal significant variation in housing, income, education, and health across New York City census tracts, providing a foundation for spatial analysis and policy targeting.'''
In [139]:
#Compare the average value of each variable across clusters
cluster_means = df.groupby("Cluster")[features].mean().T
cluster_means.plot(kind="bar", figsize=(14, 6), rot=45)
plt.title("Cluster Profiles Across Variables")
plt.ylabel("Mean Value")
plt.tight_layout()
plt.show()
In [140]:
#Show the distribution (spread + outliers) of variables like income or renter rate per cluster
import seaborn as sns
sns.boxplot(data=df, x="Cluster", y="medhhinc")
plt.title("Median Household Income by Cluster")
plt.show()
In [141]:
#Show all cluster averages as a color-coded heatmap
import seaborn as sns
sns.heatmap(cluster_means, annot=True, cmap="coolwarm")
plt.title("Cluster Variable Averages (Heatmap)")
plt.show()
In [142]:
'''The visualizations provide a clear and compelling view of the socio-economic differences among the identified clusters. The bar chart comparing cluster profiles highlights stark contrasts, with Cluster 2 emerging as the wealthiest group, exhibiting the highest median and per capita incomes, while Cluster 3 stands out as the most disadvantaged, with the highest renter rates, poverty levels, and disability counts, alongside the lowest income figures. Cluster 0 appears to represent a moderate-income group with relatively high educational attainment, whereas Cluster 1 shows a similar income profile but with a higher proportion of renters and lower education levels. The boxplot of median household income further illustrates the distribution within each cluster, confirming that Cluster 2 contains consistently high-income tracts, while Cluster 3 is both the lowest in income and the least economically diverse. Finally, the heatmap of cluster means reinforces these patterns by visually emphasizing variables that distinguish each group. Cluster 2 consistently scores high in income and low in poverty, Cluster 3 inverts this trend, and Clusters 0 and 1 display more balanced or transitional characteristics. Together, these charts provide strong evidence of socio-spatial inequality and offer a data-driven foundation for further mapping and analysis.'''
In [143]:
import pandas as pd
import folium
from IPython.display import display
# Step 1: Load the CSV with cluster data
# (a tract-level export with the cluster labels attached; created outside the cells shown here)
df = pd.read_csv("clustered_census_data.csv")  # Use your actual CSV file path
# Step 2: Check the structure of the CSV (ensure lat, lon, and cluster columns are present)
print(df.head())
# Step 3: Initialize a base Folium map centered on NYC
m = folium.Map(location=[40.7128, -74.0060], zoom_start=11)
# Step 4: Add markers for each row in the CSV
for _, row in df.iterrows():
    folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],  # assumes lat/lon columns
        radius=5,
        color="blue" if row["Cluster"] == 0 else
              "orange" if row["Cluster"] == 1 else
              "green" if row["Cluster"] == 2 else
              "red",  # color based on cluster
        fill=True,
        fill_color="blue" if row["Cluster"] == 0 else
                   "orange" if row["Cluster"] == 1 else
                   "green" if row["Cluster"] == 2 else
                   "red",
        fill_opacity=0.7,
    ).add_to(m)
# Step 5: Display the map in Jupyter or save it as HTML for use
display(m) # For Jupyter/Colab
m.save("nyc_clusters_map.html") # For a standalone map
     bct2020        block          lot          cd     bctcb2020  ct2010  \
0  1010000.0  1307.977528  1229.640449  105.617978  1.010000e+10   100.0
1  1010000.0  1307.977528  1229.640449  105.617978  1.010000e+10   100.0
2  1010000.0  1307.977528  1229.640449  105.617978  1.010000e+10   100.0
3  1020000.0  1720.403974   326.062914  110.000000  1.020000e+10   200.0
4  1020000.0  1720.403974   326.062914  110.000000  1.020000e+10   200.0

        cb2010  schooldist  council       zipcode  ...  health_ins_f18_24  \
0  1620.000000    2.000000      4.0  10023.123596  ...                0.0
1  1620.000000    2.000000      4.0  10023.123596  ...                0.0
2  1620.000000    2.000000      4.0  10023.123596  ...                0.0
3  1610.804636    4.019868      9.0  10027.357616  ...                0.0
4  1610.804636    4.019868      9.0  10027.357616  ...                0.0

                                          NAME  state  county  tract  \
0     Census Tract 101; Kings County; New York     36      47  10100
1  Census Tract 101; New York County; New York     36      61  10100
2    Census Tract 101; Queens County; New York     36      81  10100
3     Census Tract 102; Kings County; New York     36      47  10200
4  Census Tract 102; New York County; New York     36      61  10200

       county_name  tract_str  poverty_rate  renter_rate  Cluster
0     Kings County      10100      0.161742     0.693609      1.0
1  New York County      10100      0.160166     1.000000      2.0
2    Queens County      10100      0.045193     0.683664      1.0
3     Kings County      10200      0.212007     0.632509      1.0
4  New York County      10200      0.069672     0.591195      2.0

[5 rows x 94 columns]
In [144]:
'''The map visualizes the distribution of the clusters across New York City, with each cluster drawn in a distinct color following the assignment above: blue for Cluster 0, orange for Cluster 1, green for Cluster 2, and red for Cluster 3. The clusters are spread across various geographic areas, with Cluster 1, the largest group, appearing throughout the city and Cluster 0 showing a more scattered distribution. Cluster 2, marked in green, picks out the small set of affluent, low-density tracts, while Cluster 3, marked in red, flags the tract with the most extreme socio-economic indicators, such as high poverty and disability counts. The map offers a geographic view of how these clusters align with urban, residential, or low-income areas.'''
In [149]:
print(df.columns)
Index(['bct2020', 'block', 'lot', 'cd', 'bctcb2020', 'ct2010', 'cb2010',
       'schooldist', 'council', 'zipcode', 'policeprct', 'healthcenterdistrict',
       'healtharea', 'sanitboro', 'sanitdistrict', 'spdist3', 'landuse',
       'easements', 'lotarea', 'bldgarea', 'comarea', 'resarea', 'officearea',
       'retailarea', 'garagearea', 'strgearea', 'factryarea', 'otherarea',
       'areasource', 'numbldgs', 'numfloors', 'unitsres', 'unitstotal',
       'lotfront', 'lotdepth', 'bldgfront', 'bldgdepth', 'proxcode', 'lottype',
       'bsmtcode', 'assessland', 'assesstot', 'exempttot', 'yearbuilt',
       'yearalter1', 'yearalter2', 'builtfar', 'residfar', 'commfar',
       'facilfar', 'borocode', 'bbl', 'condono', 'tract2010', 'xcoord',
       'ycoord', 'taxmap', 'appbbl', 'plutomapid', 'firm07_flag',
       'pfirm15_flag', 'latitude', 'longitude', 'notes', 'clean_bct',
       'population', 'medhhinc', 'percapita_income', 'poverty_total',
       'poverty_below', 'housing_total', 'housing_owner', 'housing_renter',
       'nativity_total', 'nativity_foreign', 'hs_grad', 'bachelors_deg',
       'masters_deg', 'prof_school_deg', 'doctorate_deg', 'disability_total',
       'disability_under18', 'health_ins_total', 'health_ins_m18_24',
       'health_ins_f18_24', 'NAME', 'state', 'county', 'tract', 'county_name',
       'tract_str', 'poverty_rate', 'renter_rate', 'Cluster'],
      dtype='object')
In [155]:
# Check the distribution of clusters in the dataset
print(df["Cluster"].value_counts())
Cluster
1    31
0     7
2     3
3     1
Name: count, dtype: int64
In [151]:
#experiment: predict the cluster a census tract will fall into based on its socio-econ features
from sklearn.ensemble import RandomForestClassifier
# Define the features and target
X = df[["poverty_below", "housing_renter", "bachelors_deg", "health_ins_total"]]
y = df["Cluster"] # Target variable (cluster labels)
# Initialize the model and fit it
model = RandomForestClassifier()
model.fit(X, y)
# Predict the cluster for new data
predictions = model.predict(X) # Or new data, if available
print(predictions)
[1 2 1 1 2 1 1 1 1 2 1 1 1 1 1 0 0 1 0 0 1 1 0 0 1 1 1 1 3 1 1 1 1 1 1 1 1 0 1 1 1 1]
In [153]:
#evaluate the model
from sklearn.metrics import accuracy_score, classification_report
# Evaluate accuracy
accuracy = accuracy_score(y, predictions)
print(f"Accuracy: {accuracy}")
# Detailed classification report
print(classification_report(y, predictions))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00        31
           2       1.00      1.00      1.00         3
           3       1.00      1.00      1.00         1

    accuracy                           1.00        42
   macro avg       1.00      1.00      1.00        42
weighted avg       1.00      1.00      1.00        42
In [154]:
# Get feature importance
feature_importance = model.feature_importances_
# Create a DataFrame for better visualization
importance_df = pd.DataFrame({
'Feature': X.columns,
'Importance': feature_importance
})
# Sort features by importance
importance_df = importance_df.sort_values(by="Importance", ascending=False)
print(importance_df)
            Feature  Importance
2     bachelors_deg    0.446122
3  health_ins_total    0.355072
1    housing_renter    0.122080
0     poverty_below    0.076726
In [152]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# Fit a KMeans model to the same (unscaled) features used above
# (note: KMeans is distance-based, so the large-magnitude income/count columns dominate here)
kmeans = KMeans(n_clusters=4, random_state=42)  # Adjust n_clusters if needed
kmeans.fit(X)
# Calculate silhouette score to measure the quality of clusters
sil_score = silhouette_score(X, kmeans.labels_)
print(f"Silhouette Score: {sil_score}")
Silhouette Score: 0.481637533817962
In [158]:
'''This indicates that the clusters are moderately well-defined, but not perfectly. A silhouette score
closer to 1 would indicate better separation between clusters; a score around 0.48 suggests that while
the clusters are somewhat distinct, there is room for improvement in the clustering process.

Implications for the cluster analysis:

Class imbalance: the skewed cluster distribution could affect the model's ability to predict the less
frequent clusters (Clusters 2 and 3). Balancing the data, class weighting, or resampling could improve
predictions for these underrepresented clusters.

Overfitting concern: the model's perfect accuracy was measured on its own training data, so it almost
certainly overstates performance on new data. Evaluating on held-out data (or using cross-validation)
will give a clearer picture of its generalization ability.

Feature importance: the model leans heavily on the education and health-insurance counts, which is
useful information for interpreting socio-economic trends. It is worth investigating why these
variables are so important in distinguishing clusters.

Cluster separation: the silhouette score shows the clusters have some degree of separation, but the
result might improve by tuning the number of clusters or by exploring different clustering techniques
(e.g., hierarchical clustering).'''
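In [ ]:
# A hedged sketch of the cross-validation suggested above. Plain KFold is
# used because Clusters 2 and 3 have too few members to stratify; with only
# 42 tracts the fold scores will be noisy, so treat this as a sanity check.
from sklearn.model_selection import KFold, cross_val_score
cv = KFold(n_splits=3, shuffle=True, random_state=42)
cv_scores = cross_val_score(RandomForestClassifier(random_state=42), X, y, cv=cv)
print("CV accuracy per fold:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())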
In [160]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
# Define the features
X = df[["poverty_below", "housing_renter", "bachelors_deg", "health_ins_total"]] # Adjust features if necessary
# Range of clusters to test (e.g., 2 to 10 clusters)
cluster_range = range(2, 11)
# List to store silhouette scores for each number of clusters
sil_scores = []
# Loop through different cluster numbers
for n_clusters in cluster_range:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X)
    # Calculate the silhouette score for each clustering result
    score = silhouette_score(X, kmeans.labels_)
    sil_scores.append(score)
# Plot silhouette scores for different numbers of clusters
plt.figure(figsize=(8, 6))
plt.plot(cluster_range, sil_scores, marker='o', linestyle='--')
plt.title("Silhouette Score for Different Numbers of Clusters")
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Score")
plt.grid(True)
plt.tight_layout()
plt.show()
# Print the silhouette scores for each cluster count
for n_clusters, score in zip(cluster_range, sil_scores):
    print(f"Clusters: {n_clusters}, Silhouette Score: {score}")
Clusters: 2, Silhouette Score: 0.5022058123253923
Clusters: 3, Silhouette Score: 0.47293931184825605
Clusters: 4, Silhouette Score: 0.481637533817962
Clusters: 5, Silhouette Score: 0.3986128988890994
Clusters: 6, Silhouette Score: 0.39090080881248856
Clusters: 7, Silhouette Score: 0.4045111243251314
Clusters: 8, Silhouette Score: 0.4020455840111953
Clusters: 9, Silhouette Score: 0.3908044035886829
Clusters: 10, Silhouette Score: 0.37627035168471223
In [161]:
'''The code above:
• tests cluster counts from 2 to 10,
• calculates the silhouette score for each count, and
• plots the scores so we can choose the number of clusters with the strongest separation
  (here 2 clusters score highest, at about 0.50, with 4 clusters a close second at about 0.48).'''
In [164]:
from sklearn.cluster import DBSCAN
# Initialize DBSCAN
# (note: eps is a raw Euclidean distance, so on these unscaled features the
# large-magnitude income/count columns dominate the neighborhood search)
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X)
# Visualize the results (re-fitting the 2-component PCA from earlier on X)
X_pca = pca.fit_transform(X)
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=dbscan_labels, cmap='viridis', marker='o')
plt.title("DBSCAN Clustering")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.grid(True)
plt.tight_layout()
plt.show()
'''DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is a density-based clustering algorithm that identifies clusters based on the density of points in the feature space. Unlike KMeans, which requires specifying the number of clusters beforehand, DBSCAN has two main parameters that guide the clustering process: eps, the radius within which points count as neighbors, and min_samples, the minimum number of neighbors required to form a dense core point. Points that belong to no dense region are labeled as noise (-1).'''
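In [ ]:
# DBSCAN's eps is a raw Euclidean distance, so on unscaled features (incomes
# in the tens of thousands next to rates near 1) eps=0.5 is effectively
# meaningless. A sketch, under that caveat, of re-running it on standardized
# features:
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
labels_std = DBSCAN(eps=0.5, min_samples=5).fit_predict(X_std)
n_found = len(set(labels_std)) - (1 if -1 in labels_std else 0)
print("Clusters found on scaled features (noise excluded):", n_found)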
In [171]:
from sklearn.cluster import AgglomerativeClustering
# Initialize AgglomerativeClustering
agglo = AgglomerativeClustering(n_clusters=4) # You can change the number of clusters
agglo_labels = agglo.fit_predict(X)
# Visualize the results (using PCA for 2D visualization)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=agglo_labels, cmap='viridis', marker='o')
plt.title("Agglomerative Clustering")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.grid(True)
plt.tight_layout()
plt.show()
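In [ ]:
# A hedged companion to the cell above: a dendrogram (SciPy, with Ward
# linkage to match AgglomerativeClustering's default) can help justify
# the choice of n_clusters.
from scipy.cluster.hierarchy import dendrogram, linkage
Z = linkage(X, method="ward")
plt.figure(figsize=(10, 5))
dendrogram(Z, truncate_mode="lastp", p=20)
plt.title("Ward Linkage Dendrogram (truncated to 20 merged leaves)")
plt.tight_layout()
plt.show()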
In [172]:
# Let's redo our profiling across a wider set of features
# (still grouped by the KMeans cluster labels saved to df earlier)
# Calculate the mean of each feature for each cluster
cluster_means = df.groupby("Cluster")[["poverty_below", "housing_renter", "bachelors_deg", "health_ins_total", "population", "housing_total", "renter_rate", "medhhinc", "percapita_income"]].mean()
# Print the cluster means to interpret them
print(cluster_means)
         poverty_below  housing_renter  bachelors_deg  health_ins_total
Cluster
0           753.571429     1161.142857     446.142857       5724.000000
1           470.935484      694.193548     147.870968       3051.774194
2           162.666667      600.666667      26.333333       1382.000000
3          4070.000000     3403.000000     638.000000      10529.000000

          population  housing_total  renter_rate       medhhinc
Cluster
0        5736.571429    2119.428571     0.501938   80901.714286
1        3094.774194    1075.806452     0.609677   84023.645161
2        1383.666667     714.666667     0.757479  124717.666667
3       10529.000000    3585.000000     0.949233   44217.000000

         percapita_income
Cluster
0            38254.428571
1            39811.709677
2           115156.333333
3            19785.000000
In [173]:
import matplotlib.pyplot as plt
import seaborn as sns
# Plot the average values of each feature across clusters
# (DataFrame.plot creates its own figure, so no separate plt.figure call is needed)
cluster_means.plot(kind="bar", figsize=(14, 8), colormap="Set3")
plt.title("Average Values of Each Feature Across Clusters")
plt.xlabel("Cluster")
plt.ylabel("Average Value")
plt.grid(True)
plt.tight_layout()
plt.show()
In [174]:
# Create a box plot to show the distribution of each feature across clusters
features = ["poverty_below", "housing_renter", "bachelors_deg", "health_ins_total", "population", "housing_total", "renter_rate", "medhhinc", "percapita_income"]
plt.figure(figsize=(15, 10))
for i, feature in enumerate(features, 1):
    plt.subplot(3, 3, i)  # Arrange the boxplots in a 3x3 grid
    sns.boxplot(x="Cluster", y=feature, data=df)
    plt.title(f"Boxplot of {feature} by Cluster")
plt.tight_layout()
plt.show()
In [175]:
import seaborn as sns
# Plot a heatmap to show cluster averages
plt.figure(figsize=(10, 8))
sns.heatmap(cluster_means, annot=True, cmap="coolwarm", fmt=".2f", cbar=True)
plt.title("Cluster Variable Averages (Heatmap)")
plt.tight_layout()
plt.show()
In [178]:
import folium
import pandas as pd
# Assuming 'df' has latitude and longitude columns, and 'Cluster' for clustering information
# Create a base map centered on NYC
m = folium.Map(location=[40.7128, -74.0060], zoom_start=11)
# Color map for the clusters
color_map = {0: 'blue', 1: 'green', 2: 'red', 3: 'purple'}
# Plot the clusters on the map using CircleMarkers
for i, row in df.iterrows():
    folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],  # requires latitude and longitude columns
        radius=5,
        color=color_map[row["Cluster"]],  # color based on the cluster label
        fill=True,
        fill_color=color_map[row["Cluster"]],
        fill_opacity=0.7
    ).add_to(m)
# Save the map to an HTML file
m.save("nyc_clusters_map.html")
# Display the map (for Jupyter/Colab)
display(m)
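In [ ]:
# Optional refinement, sketched under the same column assumptions as above:
# tooltips make the markers self-describing on hover.
m2 = folium.Map(location=[40.7128, -74.0060], zoom_start=11)
for _, row in df.iterrows():
    folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],
        radius=5,
        color=color_map[row["Cluster"]],
        fill=True,
        fill_color=color_map[row["Cluster"]],
        fill_opacity=0.7,
        tooltip=f"{row['NAME']} (Cluster {int(row['Cluster'])})",
    ).add_to(m2)
display(m2)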
In [ ]:
''' • Cluster 0: Moderate-income areas with moderate poverty and a roughly even split between renters and owners.
• Cluster 1: Areas with moderate socio-economic conditions in both poverty and rent, with middle incomes and a somewhat higher share of renters.
• Cluster 2: Affluent, low-density neighborhoods with low poverty and high incomes, though with small counts of degree holders and insured residents, reflecting their small populations.
• Cluster 3: A large-population, low-income area with the highest poverty and renter rates in the data.'''
In [ ]: