In [67]:
# Essential import statements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# this is sometimes helpful to suppress scientific notation in the display of arrays
np.set_printoptions(suppress=True)
In [68]:
#import Census package
from census import Census
import time
In [ ]:
#first we are going to make the census API and get the information we want 
In [36]:
# pulling ACS data 
API_KEY = "KEY"
c = Census(API_KEY, year=2023)

variables = [
    "NAME",
    # Education Variables
    "B15003_001E", "B15003_017E", "B15003_018E", "B15003_019E", "B15003_020E",
    "B15003_021E", "B15003_022E", "B15003_023E", "B15003_024E", "B14001_001E",
    "B14001_002E", "B14001_008E", "B14002_001E", "B14002_003E", "B14002_004E",
    "B14007_001E", "B14007_003E",  
    # Income Variables
    "B19013_001E", "B19301_001E", "B17021_001E", "B17021_002E",
    # Health Variables
    "B18135_001E", "B18135_007E", "B27001_001E", "B27001_005E", "B27001_033E",
    # Immigration Variables
    "B05002_001E", "B05002_013E", "B05006_001E", "B05006_005E",
    # NEW: Population and Housing
    "B02001_001E", "B25003_001E", "B25003_002E", "B25003_003E",
]

# Function to split variable list into chunks of 40 (to avoid API limits)
def chunk_list(lst, size=40):
    for i in range(0, len(lst), size):
        yield lst[i:i + size]

# Safe request function
def safe_census_request(census_obj, fields, geo, retries=3):
    for attempt in range(retries):
        try:
            return census_obj.acs5.get(fields, geo)
        except Exception as e:
            print(f"API request failed (Attempt {attempt+1}/{retries}): {e}")
            time.sleep(5)
    return None

# Pull data for all 5 boroughs
nyc_counties = ["005", "047", "061", "081", "085"]
dfs = []

for county_code in nyc_counties:
    print(f"🔄 Pulling data for county: {county_code}...")
    
    county_dfs = []
    for chunk in chunk_list(variables, size=40):
        geo_filter = {"for": "tract:*", "in": f"state:36 county:{county_code}"}
        data = safe_census_request(c, chunk, geo_filter)
        
        if data:
            df_chunk = pd.DataFrame(data)
            county_dfs.append(df_chunk)
        else:
            print(f"⚠️ Skipping chunk for county {county_code} due to API failure.")
    
    if county_dfs:
        county_df = pd.concat(county_dfs, axis=1)
        county_df = county_df.loc[:, ~county_df.columns.duplicated()]
        dfs.append(county_df)

# ✅ Combine all counties into one big DataFrame
if dfs:
    merged_df = pd.concat(dfs, axis=0)
    merged_df.to_csv("census_data_ny_2023.csv", index=False)
    print("✅ Full NYC data saved as 'census_data_ny_2023.csv'")
else:
    print("🚫 No data retrieved. Check API errors.")
🔄 Pulling data for county: 005...
🔄 Pulling data for county: 047...
🔄 Pulling data for county: 061...
🔄 Pulling data for county: 081...
🔄 Pulling data for county: 085...
✅ Full NYC data saved as 'census_data_ny_2023.csv'
In [ ]:
#in the below section, we are going to clean the data and prepare it for a join with the pluto data 
In [69]:
# Read the CSV file
census_df = pd.read_csv("census_data_ny_2023.csv")

# Print the DataFrame
print(census_df)
                                                NAME  B15003_001E  \
0             Census Tract 1; Bronx County; New York       3117.0   
1             Census Tract 2; Bronx County; New York       3735.0   
2             Census Tract 4; Bronx County; New York       4557.0   
3            Census Tract 16; Bronx County; New York       3858.0   
4         Census Tract 19.01; Bronx County; New York       1710.0   
...                                              ...          ...   
2322  Census Tract 303.02; Richmond County; New York       4644.0   
2323  Census Tract 319.01; Richmond County; New York       2288.0   
2324  Census Tract 319.02; Richmond County; New York       2976.0   
2325     Census Tract 323; Richmond County; New York        774.0   
2326    Census Tract 9901; Richmond County; New York          0.0   

      B15003_017E  B15003_018E  B15003_019E  B15003_020E  B15003_021E  \
0          1022.0        501.0         80.0        209.0         60.0   
1           886.0        320.0         26.0        466.0        298.0   
2           887.0         74.0        212.0        490.0        390.0   
3           967.0        105.0        137.0        361.0        448.0   
4           352.0         10.0         35.0        223.0        233.0   
...           ...          ...          ...          ...          ...   
2322        887.0        304.0        206.0        542.0        632.0   
2323        369.0        240.0        112.0        165.0        188.0   
2324        640.0        214.0        167.0        314.0        140.0   
2325        202.0         30.0         15.0        103.0         37.0   
2326          0.0          0.0          0.0          0.0          0.0   

      B15003_022E  B15003_023E  B15003_024E  ...  B05002_013E  B05006_001E  \
0            18.0          7.0          9.0  ...        396.0        396.0   
1           574.0        649.0          0.0  ...       1989.0       1989.0   
2          1118.0        391.0        184.0  ...       1027.0       1027.0   
3           492.0        213.0          0.0  ...       1810.0       1810.0   
4           302.0        206.0         17.0  ...        595.0        595.0   
...           ...          ...          ...  ...          ...          ...   
2322       1169.0        297.0         62.0  ...       2128.0       2128.0   
2323        168.0        185.0          3.0  ...        522.0        522.0   
2324        708.0        234.0          0.0  ...       1122.0       1122.0   
2325        109.0        147.0          0.0  ...        211.0        211.0   
2326          0.0          0.0          0.0  ...          0.0          0.0   

      B05006_005E  B02001_001E  B25003_001E  B25003_002E  B25003_003E  state  \
0             0.0       3538.0          0.0          0.0          0.0     36   
1             0.0       5177.0       1446.0        864.0        582.0     36   
2             0.0       6481.0       2246.0       1376.0        870.0     36   
3             8.0       6011.0       2149.0        496.0       1653.0     36   
4             0.0       2401.0       1092.0          0.0       1092.0     36   
...           ...          ...          ...          ...          ...    ...   
2322          0.0       6502.0       2140.0       1707.0        433.0     36   
2323          0.0       3888.0       1237.0        271.0        966.0     36   
2324          0.0       4502.0       1588.0        833.0        755.0     36   
2325          0.0       1078.0        453.0        320.0        133.0     36   
2326          0.0          0.0          0.0          0.0          0.0     36   

      county   tract  
0          5     100  
1          5     200  
2          5     400  
3          5    1600  
4          5    1901  
...      ...     ...  
2322      85   30302  
2323      85   31901  
2324      85   31902  
2325      85   32300  
2326      85  990100  

[2327 rows x 38 columns]
In [70]:
data_df = pd.DataFrame(data)
print(data_df.sample(5))
                                              NAME  B15003_001E  B15003_017E  \
80  Census Tract 177.02; Richmond County; New York       1844.0        546.0   
45  Census Tract 128.04; Richmond County; New York       3453.0        770.0   
96     Census Tract 213; Richmond County; New York       3620.0        834.0   
53     Census Tract 134; Richmond County; New York       2549.0        629.0   
6       Census Tract 17; Richmond County; New York       1139.0        176.0   

    B15003_018E  B15003_019E  B15003_020E  B15003_021E  B15003_022E  \
80         48.0        266.0        181.0        165.0        404.0   
45        189.0        317.0        371.0        248.0        617.0   
96        501.0         94.0        266.0        240.0        542.0   
53         16.0        116.0        272.0        203.0        654.0   
6          60.0         47.0        116.0        120.0        273.0   

    B15003_023E  B15003_024E  ...  B05002_013E  B05006_001E  B05006_005E  \
80        177.0          9.0  ...        385.0        385.0          0.0   
45        228.0         40.0  ...       1936.0       1936.0          0.0   
96        521.0         87.0  ...       1167.0       1167.0          0.0   
53        284.0         82.0  ...        588.0        588.0          0.0   
6         123.0         84.0  ...        300.0        300.0          0.0   

    B02001_001E  B25003_001E  B25003_002E  B25003_003E  state  county   tract  
80       2338.0        942.0        738.0        204.0     36     085  017702  
45       4643.0       1490.0       1061.0        429.0     36     085  012804  
96       5391.0       1540.0        932.0        608.0     36     085  021300  
53       3660.0       1370.0        934.0        436.0     36     085  013400  
6        1518.0        612.0        296.0        316.0     36     085  001700  

[5 rows x 38 columns]
In [71]:
print(data_df.columns)
Index(['NAME', 'B15003_001E', 'B15003_017E', 'B15003_018E', 'B15003_019E',
       'B15003_020E', 'B15003_021E', 'B15003_022E', 'B15003_023E',
       'B15003_024E', 'B14001_001E', 'B14001_002E', 'B14001_008E',
       'B14002_001E', 'B14002_003E', 'B14002_004E', 'B14007_001E',
       'B14007_003E', 'B19013_001E', 'B19301_001E', 'B17021_001E',
       'B17021_002E', 'B18135_001E', 'B18135_007E', 'B27001_001E',
       'B27001_005E', 'B27001_033E', 'B05002_001E', 'B05002_013E',
       'B05006_001E', 'B05006_005E', 'B02001_001E', 'B25003_001E',
       'B25003_002E', 'B25003_003E', 'state', 'county', 'tract'],
      dtype='object')
In [72]:
import pandas as pd

# Read in the dataset
merged_df = pd.read_csv("census_data_ny_2023.csv")

# Rename columns for readability
rename_dict = {
    "B02001_001E": "population",           # Total population
    "B19013_001E": "medhhinc",             # Median household income
    "B19301_001E": "percapita_income",     # Per capita income
    "B17021_001E": "poverty_total",        # Total population for poverty status
    "B17021_002E": "poverty_below",        # Population below poverty level
    "B25003_001E": "housing_total",        # Total housing units
    "B25003_002E": "housing_owner",        # Owner-occupied housing
    "B25003_003E": "housing_renter",       # Renter-occupied housing
    "B05002_001E": "nativity_total",       # Total population for nativity
    "B05002_013E": "nativity_foreign",     # Foreign-born population
    "B15003_017E": "hs_grad",              # High school graduate
    "B15003_021E": "bachelors_deg",        # Bachelor's degree
    "B15003_022E": "masters_deg",          # Master's degree
    "B15003_023E": "prof_school_deg",      # Professional school degree
    "B15003_024E": "doctorate_deg",        # Doctorate degree
    "B18135_001E": "disability_total",     # Total population with a disability
    "B18135_007E": "disability_under18",   # Under 18 with a disability
    "B27001_001E": "health_ins_total",     # Total population for health insurance
    "B27001_005E": "health_ins_m18_24",    # Males 18–24 with health insurance
    "B27001_033E": "health_ins_f18_24",    # Females 18–24 with health insurance
}

# Apply renaming
merged_df.rename(columns=rename_dict, inplace=True)

# Keep only the renamed columns + tract identifiers
keep_columns = list(rename_dict.values()) + ["NAME", "state", "county", "tract"]
merged_df = merged_df[keep_columns]

# Track the size of the dataset before drops
print("Before cleaning:", len(merged_df))

# Drop rows where median household income is missing or negative
data_df = merged_df.loc[merged_df["medhhinc"] >= 0]

# Drop any remaining rows with missing values
data_df = data_df.dropna()

# Track size after cleaning
print("After cleaning:", len(data_df))

# Calculate derived columns
data_df["povpctind"] = (data_df["poverty_below"] / data_df["poverty_total"]) * 100
data_df["tenpctown"] = (data_df["housing_owner"] / data_df["housing_total"]) * 100
data_df["tenpctren"] = (data_df["housing_renter"] / data_df["housing_total"]) * 100
data_df["natpctfor"] = (data_df["nativity_foreign"] / data_df["nativity_total"]) * 100

# Save cleaned and processed dataset
data_df.to_csv("mycensusdownload.csv", index=False)
print("✅ Cleaned dataset saved as 'mycensusdownload.csv'")
Before cleaning: 2327
After cleaning: 2192
✅ Cleaned dataset saved as 'mycensusdownload.csv'
In [73]:
# Load and verify the Census data file that we previously created using the API
census_df = pd.read_csv('mycensusdownload.csv') 

#############
# What are some ways to preview this dataset? 


# Display basic dataset information
print("Dataset Info:")
census_df.info()

# Display the first few rows of the dataset
print("\nPreview of the dataset:")
census_df.head()


# Display all column names
print("\nColumn Names:")
print(census_df.columns.tolist())
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2192 entries, 0 to 2191
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   population          2192 non-null   float64
 1   medhhinc            2192 non-null   float64
 2   percapita_income    2192 non-null   float64
 3   poverty_total       2192 non-null   float64
 4   poverty_below       2192 non-null   float64
 5   housing_total       2192 non-null   float64
 6   housing_owner       2192 non-null   float64
 7   housing_renter      2192 non-null   float64
 8   nativity_total      2192 non-null   float64
 9   nativity_foreign    2192 non-null   float64
 10  hs_grad             2192 non-null   float64
 11  bachelors_deg       2192 non-null   float64
 12  masters_deg         2192 non-null   float64
 13  prof_school_deg     2192 non-null   float64
 14  doctorate_deg       2192 non-null   float64
 15  disability_total    2192 non-null   float64
 16  disability_under18  2192 non-null   float64
 17  health_ins_total    2192 non-null   float64
 18  health_ins_m18_24   2192 non-null   float64
 19  health_ins_f18_24   2192 non-null   float64
 20  NAME                2192 non-null   object 
 21  state               2192 non-null   int64  
 22  county              2192 non-null   int64  
 23  tract               2192 non-null   int64  
 24  povpctind           2192 non-null   float64
 25  tenpctown           2192 non-null   float64
 26  tenpctren           2192 non-null   float64
 27  natpctfor           2192 non-null   float64
dtypes: float64(24), int64(3), object(1)
memory usage: 479.6+ KB

Preview of the dataset:

Column Names:
['population', 'medhhinc', 'percapita_income', 'poverty_total', 'poverty_below', 'housing_total', 'housing_owner', 'housing_renter', 'nativity_total', 'nativity_foreign', 'hs_grad', 'bachelors_deg', 'masters_deg', 'prof_school_deg', 'doctorate_deg', 'disability_total', 'disability_under18', 'health_ins_total', 'health_ins_m18_24', 'health_ins_f18_24', 'NAME', 'state', 'county', 'tract', 'povpctind', 'tenpctown', 'tenpctren', 'natpctfor']
In [74]:
# Print the 'NAME' column
print(census_df['NAME'])
0               Census Tract 2; Bronx County; New York
1               Census Tract 4; Bronx County; New York
2              Census Tract 16; Bronx County; New York
3           Census Tract 19.01; Bronx County; New York
4           Census Tract 19.02; Bronx County; New York
                             ...                      
2187    Census Tract 303.01; Richmond County; New York
2188    Census Tract 303.02; Richmond County; New York
2189    Census Tract 319.01; Richmond County; New York
2190    Census Tract 319.02; Richmond County; New York
2191       Census Tract 323; Richmond County; New York
Name: NAME, Length: 2192, dtype: object
In [75]:
#print county column
print(census_df['county'])
0        5
1        5
2        5
3        5
4        5
        ..
2187    85
2188    85
2189    85
2190    85
2191    85
Name: county, Length: 2192, dtype: int64
In [76]:
print(census_df['county'].unique())
[ 5 47 61 81 85]
In [77]:
county_name_map = {
    "005": "Bronx County",
    "047": "Kings County",
    "061": "New York County",
    "081": "Queens County",
    "085": "Richmond County"
}
# Ensure county codes are strings
merged_df["county"] = merged_df["county"].astype(str).str.zfill(3)

# Map county codes to readable names
merged_df["county_name"] = merged_df["county"].map(county_name_map)

# Preview the new column
print(merged_df[["county", "county_name"]].drop_duplicates())
     county      county_name
0       005     Bronx County
361     047     Kings County
1166    061  New York County
1476    081    Queens County
2201    085  Richmond County
In [78]:
# Ensure 'NAME' column is treated as a string, replacing NaNs with an empty string
census_df['NAME'] = census_df['NAME'].astype(str).fillna('')

# Extract the census tract from the NAME column (before the semicolon)
census_df['tract_name'] = census_df['NAME'].apply(
    lambda x: x.split(";")[0].strip() if ";" in x else ""
)

# Display the first few rows to verify
print(census_df[['NAME', 'tract_name']].head())
                                         NAME          tract_name
0      Census Tract 2; Bronx County; New York      Census Tract 2
1      Census Tract 4; Bronx County; New York      Census Tract 4
2     Census Tract 16; Bronx County; New York     Census Tract 16
3  Census Tract 19.01; Bronx County; New York  Census Tract 19.01
4  Census Tract 19.02; Bronx County; New York  Census Tract 19.02
In [79]:
# Display unique values in the 'state' column to determine the correct format
print("Unique state values in the dataset:")
print(census_df['state'].unique())
Unique state values in the dataset:
[36]
In [80]:
import re

# Extract numeric portion from 'tract_name' using regex
census_df['tract_number'] = census_df['tract_name'].apply(
    lambda x: re.search(r'\d+(\.\d+)?', x).group() if re.search(r'\d+(\.\d+)?', x) else None
)

# Convert to float for sorting/grouping
census_df['tract_number'] = census_df['tract_number'].astype(float)

print(census_df[['tract_name', 'tract_number']].head())

# Extract borough from county_name if it's available
# (we'll add this only if the column doesn't exist yet)
if 'county_name' not in census_df.columns:
    census_df['county_name'] = census_df['NAME'].apply(
        lambda x: x.split(";")[1].strip() if ";" in x else ""
    )

# Map county_name to borough
borough_map = {
    "New York County": "Manhattan",
    "Kings County": "Brooklyn",
    "Bronx County": "Bronx",
    "Queens County": "Queens",
    "Richmond County": "Staten Island"
}

census_df['borough'] = census_df['county_name'].map(borough_map)

print(census_df[['county_name', 'borough']].drop_duplicates())
           tract_name  tract_number
0      Census Tract 2          2.00
1      Census Tract 4          4.00
2     Census Tract 16         16.00
3  Census Tract 19.01         19.01
4  Census Tract 19.02         19.02
          county_name        borough
0        Bronx County          Bronx
334      Kings County       Brooklyn
1104  New York County      Manhattan
1401    Queens County         Queens
2075  Richmond County  Staten Island
In [81]:
#print unique county names
print("Number of unique counties:", census_df['county_name'].nunique())
Number of unique counties: 5
In [82]:
print("Available columns in data_df:")
print(data_df.columns)
print(merged_df["tract"])
Available columns in data_df:
Index(['population', 'medhhinc', 'percapita_income', 'poverty_total',
       'poverty_below', 'housing_total', 'housing_owner', 'housing_renter',
       'nativity_total', 'nativity_foreign', 'hs_grad', 'bachelors_deg',
       'masters_deg', 'prof_school_deg', 'doctorate_deg', 'disability_total',
       'disability_under18', 'health_ins_total', 'health_ins_m18_24',
       'health_ins_f18_24', 'NAME', 'state', 'county', 'tract', 'povpctind',
       'tenpctown', 'tenpctren', 'natpctfor'],
      dtype='object')
0          100
1          200
2          400
3         1600
4         1901
         ...  
2322     30302
2323     31901
2324     31902
2325     32300
2326    990100
Name: tract, Length: 2327, dtype: int64
In [83]:
#taking a look at pluto Data

# Define the file path (adjust if needed)
file_path = "nyc_pluto_24v4_1_csv/pluto_24v4_1.csv"

# Load the PLUTO dataset
pluto_df = pd.read_csv(file_path)

# Display basic info
print("PLUTO Dataset Info:")
pluto_df.info()

# Show the first few rows
pluto_df.head()
/var/folders/7w/1y1zc9c95qnc86tchl3m3yr00000gn/T/ipykernel_9002/2068003549.py:7: DtypeWarning: Columns (21,22,24,26,28,65,66) have mixed types. Specify dtype option on import or set low_memory=False.
  pluto_df = pd.read_csv(file_path)
PLUTO Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858642 entries, 0 to 858641
Data columns (total 92 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   borough               858642 non-null  object 
 1   block                 858642 non-null  int64  
 2   lot                   858642 non-null  int64  
 3   cd                    857534 non-null  float64
 4   bct2020               857449 non-null  float64
 5   bctcb2020             857449 non-null  float64
 6   ct2010                857449 non-null  float64
 7   cb2010                857449 non-null  float64
 8   schooldist            856807 non-null  float64
 9   council               857532 non-null  float64
 10  zipcode               856795 non-null  float64
 11  firecomp              856788 non-null  object 
 12  policeprct            856802 non-null  float64
 13  healthcenterdistrict  856802 non-null  float64
 14  healtharea            856802 non-null  float64
 15  sanitboro             856603 non-null  float64
 16  sanitdistrict         856603 non-null  float64
 17  sanitsub              856483 non-null  object 
 18  address               858294 non-null  object 
 19  zonedist1             856961 non-null  object 
 20  zonedist2             19654 non-null   object 
 21  zonedist3             214 non-null     object 
 22  zonedist4             12 non-null      object 
 23  overlay1              74482 non-null   object 
 24  overlay2              183 non-null     object 
 25  spdist1               105653 non-null  object 
 26  spdist2               300 non-null     object 
 27  spdist3               0 non-null       float64
 28  ltdheight             3045 non-null    object 
 29  splitzone             856961 non-null  object 
 30  bldgclass             858297 non-null  object 
 31  landuse               855871 non-null  float64
 32  easements             858296 non-null  float64
 33  ownertype             36006 non-null   object 
 34  ownername             857773 non-null  object 
 35  lotarea               857791 non-null  float64
 36  bldgarea              858256 non-null  float64
 37  comarea               811661 non-null  float64
 38  resarea               811661 non-null  float64
 39  officearea            811661 non-null  float64
 40  retailarea            811661 non-null  float64
 41  garagearea            811661 non-null  float64
 42  strgearea             811661 non-null  float64
 43  factryarea            811661 non-null  float64
 44  otherarea             811661 non-null  float64
 45  areasource            858296 non-null  float64
 46  numbldgs              857793 non-null  float64
 47  numfloors             815803 non-null  float64
 48  unitsres              858146 non-null  float64
 49  unitstotal            858147 non-null  float64
 50  lotfront              857791 non-null  float64
 51  lotdepth              857791 non-null  float64
 52  bldgfront             857791 non-null  float64
 53  bldgdepth             857791 non-null  float64
 54  ext                   795788 non-null  object 
 55  proxcode              858296 non-null  float64
 56  irrlotcode            858296 non-null  object 
 57  lottype               858296 non-null  float64
 58  bsmtcode              858296 non-null  float64
 59  assessland            858296 non-null  float64
 60  assesstot             858296 non-null  float64
 61  exempttot             858296 non-null  float64
 62  yearbuilt             858296 non-null  float64
 63  yearalter1            858296 non-null  float64
 64  yearalter2            858296 non-null  float64
 65  histdist              32536 non-null   object 
 66  landmark              1456 non-null    object 
 67  builtfar              857414 non-null  float64
 68  residfar              858642 non-null  float64
 69  commfar               858642 non-null  float64
 70  facilfar              858642 non-null  float64
 71  borocode              858642 non-null  int64  
 72  bbl                   858642 non-null  float64
 73  condono               10441 non-null   float64
 74  tract2010             857449 non-null  float64
 75  xcoord                857421 non-null  float64
 76  ycoord                857421 non-null  float64
 77  zonemap               857021 non-null  object 
 78  zmcode                15636 non-null   object 
 79  sanborn               856927 non-null  object 
 80  taxmap                856927 non-null  float64
 81  edesignum             10049 non-null   object 
 82  appbbl                96703 non-null   float64
 83  appdate               96703 non-null   object 
 84  plutomapid            858642 non-null  int64  
 85  firm07_flag           34677 non-null   float64
 86  pfirm15_flag          65750 non-null   float64
 87  version               858642 non-null  object 
 88  dcpedited             42097 non-null   object 
 89  latitude              857421 non-null  float64
 90  longitude             857421 non-null  float64
 91  notes                 0 non-null       float64
dtypes: float64(60), int64(4), object(28)
memory usage: 602.7+ MB
Out[83]:
borough block lot cd bct2020 bctcb2020 ct2010 cb2010 schooldist council ... appbbl appdate plutomapid firm07_flag pfirm15_flag version dcpedited latitude longitude notes
0 BK 7604 47 314.0 3075200.0 3.075200e+10 752.0 1004.0 22.0 45.0 ... NaN NaN 1 NaN NaN 24v4.1 NaN 40.625779 -73.953658 NaN
1 BK 1304 1 309.0 3032300.0 3.032300e+10 323.0 3002.0 17.0 35.0 ... 3.013040e+09 10/20/1989 1 NaN NaN 24v4.1 NaN 40.664671 -73.956901 NaN
2 BK 1304 58 309.0 3032300.0 3.032300e+10 323.0 3001.0 17.0 35.0 ... NaN NaN 1 NaN NaN 24v4.1 NaN 40.665104 -73.955675 NaN
3 BK 1294 1 309.0 3032500.0 3.032500e+10 325.0 3000.0 17.0 35.0 ... 3.012940e+09 02/27/1989 1 NaN NaN 24v4.1 NaN 40.666423 -73.958074 NaN
4 BK 1302 1 309.0 3032500.0 3.032500e+10 325.0 3001.0 17.0 35.0 ... NaN NaN 1 NaN NaN 24v4.1 NaN 40.665062 -73.958101 NaN

5 rows × 92 columns

In [84]:
##in this section we are looking to see if there are ways to join the pluto and census data
#print bct (from pluto)
print(pluto_df["bct2020"])

#print tract from census
print(merged_df["tract"])
0         3075200.0
1         3032300.0
2         3032300.0
3         3032500.0
4         3032500.0
            ...    
858637    5020803.0
858638    2033700.0
858639    3097000.0
858640    2041400.0
858641    3029000.0
Name: bct2020, Length: 858642, dtype: float64
0          100
1          200
2          400
3         1600
4         1901
         ...  
2322     30302
2323     31901
2324     31902
2325     32300
2326    990100
Name: tract, Length: 2327, dtype: int64
In [87]:
# Convert both columns to string and sets
pluto_tracts = set(pluto_df["bct2020"].astype(str))
census_tracts = set(merged_df["tract"].astype(str))
In [88]:
# Find intersection
matching_tracts = pluto_tracts.intersection(census_tracts)

# Print results
print(f"✅ Matching tracts: {len(matching_tracts)}")
print("Sample matches:", list(matching_tracts)[:10])  # Show first 10 matches
✅ Matching tracts: 0
Sample matches: []
In [89]:
#it looks like there are not matches because the data is in different formats 
#we need to remove the last three digits, including the decimal point from the bct2020 column and try again
In [90]:
# Step 1: Convert to string
pluto_df["clean_bct"] = pluto_df["bct2020"].astype(str)

# Step 2: Remove trailing '00.0' from the end
pluto_df["clean_bct"] = pluto_df["clean_bct"].str.replace("00.0$", "", regex=True)

# Preview
print(pluto_df[["bct2020", "clean_bct"]].head())
     bct2020 clean_bct
0  3075200.0     30752
1  3032300.0     30323
2  3032300.0     30323
3  3032500.0     30325
4  3032500.0     30325
In [91]:
#lets make a new column to hold these new values: # Step 1: Convert to string
pluto_df["clean_bct"] = pluto_df["bct2020"].astype(str)

# Step 2: Remove trailing '00.0'
pluto_df["clean_bct"] = pluto_df["clean_bct"].str.replace("00.0$", "", regex=True)

# Step 3: Ensure it's zero-padded to 6 digits (optional, but recommended for matching)
pluto_df["clean_bct"] = pluto_df["clean_bct"].str.zfill(6)

# ✅ Preview cleaned values
print(pluto_df[["bct2020", "clean_bct"]].head())
     bct2020 clean_bct
0  3075200.0    030752
1  3032300.0    030323
2  3032300.0    030323
3  3032500.0    030325
4  3032500.0    030325
In [92]:
# Convert to sets of strings for comparison
pluto_tracts = set(pluto_df["clean_bct"])
census_tracts = set(merged_df["tract"].astype(str).str.zfill(6))

# Find intersection
matching_tracts = pluto_tracts.intersection(census_tracts)

# Print results
print(f"✅ Matching tracts: {len(matching_tracts)}")
print("Sample matches:", list(matching_tracts)[:10])
✅ Matching tracts: 25
Sample matches: ['030900', '050201', '020200', '031200', '020300', '030600', '030301', '040100', '030300', '040101']
In [93]:
#merge data and export to look for possible cluster analysis 
merged_df["tract_str"] = merged_df["tract"].astype(str).str.zfill(6)

# Perform the inner merge to keep only matching tracts
merged_data = pd.merge(
    pluto_df,
    merged_df,
    left_on="clean_bct",
    right_on="tract_str",
    how="inner"
)

# ✅ Save to CSV
merged_data.to_csv("merged_pluto_census_matched.csv", index=False)
print(f"✅ Merged dataset saved with {merged_data.shape[0]} matching rows.")
✅ Merged dataset saved with 18917 matching rows.
In [66]:
# Step 1: Group PLUTO data by bct2020 and take average of numeric fields
pluto_tract_df = pluto_df.groupby("bct2020").mean(numeric_only=True).reset_index()

# Step 2: Clean bct2020 for joining
pluto_tract_df["clean_bct"] = (
    pluto_tract_df["bct2020"]
    .astype(str)
    .str.replace("00.0$", "", regex=True)
    .str.zfill(6)
)

# Step 3: Prepare census tract column
merged_df["tract_str"] = merged_df["tract"].astype(str).str.zfill(6)

# Step 4: Merge on matching tracts
merged_data = pd.merge(
    pluto_tract_df,
    merged_df,
    left_on="clean_bct",
    right_on="tract_str",
    how="inner"
)

# Step 5: Export
merged_data.to_csv("merged_pluto_census_by_tract.csv", index=False)
print(f"✅ Exported merged data with shape: {merged_data.shape}")
✅ Exported merged data with shape: (42, 91)
In [96]:
# what we are trying to do with the following cluster analysis:

''' Goal of Clustering 
	•	 Which neighborhoods are demographically or socioeconomically similar?
	•	Are there clusters of high-need areas that share poverty + low education + health disparities?
	•	Which tracts might be more vulnerable or more affluent?
	•	Can we identify “types” of tracts — like high-density, low-income, high-disability areas? '''
Out[96]:
' Goal of Clustering \n\t•\t Which neighborhoods are demographically or socioeconomically similar?\n\t•\tAre there clusters of high-need areas that share poverty + low education + health disparities?\n\t•\tWhich tracts might be more vulnerable or more affluent?\n\t•\tCan we identify “types” of tracts — like high-density, low-income, high-disability areas? '
In [135]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Load your merged dataset
df = pd.read_csv("merged_pluto_census_by_tract.csv")

# -------------------------------
# 🎯 Step 1: Select & Engineer Features
# -------------------------------
df["poverty_rate"] = df["poverty_below"] / df["poverty_total"]
df["renter_rate"] = df["housing_renter"] / df["housing_total"]

features = [
    "population",
    "housing_total",
    "renter_rate",
    "medhhinc",
    "percapita_income",
    "poverty_rate",
    "hs_grad",
    "bachelors_deg",
    "disability_total",
    "disability_under18",
    "health_ins_total"
]

data = df[features].dropna()

# -------------------------------
# 🔍 Step 2: Standardize Data
# -------------------------------
scaler = StandardScaler()
scaled = scaler.fit_transform(data)

# -------------------------------
# 📊 Step 3: PCA for Visualization
# -------------------------------
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled)

pca_df = pd.DataFrame(pca_result, columns=["PC1", "PC2"])

# -------------------------------
# 🔀 Step 4: KMeans Clustering
# -------------------------------
kmeans = KMeans(n_clusters=4, random_state=42)
clusters = kmeans.fit_predict(scaled)

pca_df["Cluster"] = clusters

# -------------------------------
# 🖼️ Step 5: Plot Clusters
# -------------------------------
plt.figure(figsize=(8, 6))
sns.scatterplot(data=pca_df, x="PC1", y="PC2", hue="Cluster", palette="tab10")
plt.title("Census Tract Clusters (PCA)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(title="Cluster")
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [148]:
# Save the cluster assignments to the original df
df["Cluster"] = clusters  # Ensure cluster assignments are added to the original dataframe
In [136]:
'''The scatter plot presented is a visual summary of the cluster analysis performed on New York City census tracts. 
Each point on the plot represents a single tract, with its position determined through Principal Component Analysis (PCA), 
which reduces a set of eleven socio-economic and demographic variables—such as median household income, renter rates, education levels,
and disability counts—into two principal components. These components, while not directly interpretable, capture the greatest variation 
across all tracts, allowing for easier visualization of high-dimensional data.

The colors assigned to each point indicate cluster membership as defined by the KMeans clustering algorithm. 
In this case, the data has been segmented into four distinct clusters. One cluster, shown in orange (Cluster 1), 
appears densely packed in the center of the plot, suggesting a group of census tracts with similar characteristics. 
In contrast, other clusters—such as those shown in blue, green, and red—are more sparsely distributed or located at the edges, 
indicating that those tracts differ significantly from the central group in terms of their socio-economic profiles.

This PCA plot provides a high-level overview of how census tracts are grouped based on shared attributes. Outliers, 
such as the red point located far to the right or the blue points positioned below the central cluster, represent tracts that
are particularly distinct in the dataset. To further interpret the meaning of each cluster, the next step involves analyzing the average 
values of the selected variables within each group. This allows for the identification of defining traits—such as high poverty rates, 
low educational attainment, or elevated renter populations—and supports more informed mapping and policy analysis across different
neighborhoods in the city.'''
Out[136]:
'The scatter plot presented is a visual summary of the cluster analysis performed on New York City census tracts. \nEach point on the plot represents a single tract, with its position determined through Principal Component Analysis (PCA), \nwhich reduces a set of eleven socio-economic and demographic variables—such as median household income, renter rates, education levels,\nand disability counts—into two principal components. These components, while not directly interpretable, capture the greatest variation \nacross all tracts, allowing for easier visualization of high-dimensional data.\n\nThe colors assigned to each point indicate cluster membership as defined by the KMeans clustering algorithm. \nIn this case, the data has been segmented into four distinct clusters. One cluster, shown in orange (Cluster 1), \nappears densely packed in the center of the plot, suggesting a group of census tracts with similar characteristics. \nIn contrast, other clusters—such as those shown in blue, green, and red—are more sparsely distributed or located at the edges, \nindicating that those tracts differ significantly from the central group in terms of their socio-economic profiles.\n\nThis PCA plot provides a high-level overview of how census tracts are grouped based on shared attributes. Outliers, \nsuch as the red point located far to the right or the blue points positioned below the central cluster, represent tracts that\nare particularly distinct in the dataset. To further interpret the meaning of each cluster, the next step involves analyzing the average \nvalues of the selected variables within each group. This allows for the identification of defining traits—such as high poverty rates, \nlow educational attainment, or elevated renter populations—and supports more informed mapping and policy analysis across different\nneighborhoods in the city.'
In [137]:
#The table shows the mean values of each variable (like income, poverty, education, etc.) for the tracts in each cluster. 
#By comparing these averages across clusters, we can spot patterns and differences that help describe the character of each group.

df.groupby("Cluster")[features].mean()
Out[137]:
population housing_total renter_rate medhhinc percapita_income poverty_rate hs_grad bachelors_deg disability_total disability_under18 health_ins_total
Cluster
0.0 5736.571429 2119.428571 0.501938 80901.714286 38254.428571 0.128852 1088.142857 446.142857 5724.000000 0.0 5724.000000
1.0 3094.774194 1075.806452 0.609677 84023.645161 39811.709677 0.146679 491.548387 147.870968 3051.774194 0.0 3051.774194
2.0 1383.666667 714.666667 0.757479 124717.666667 115156.333333 0.103999 77.333333 26.333333 1382.000000 0.0 1382.000000
3.0 10529.000000 3585.000000 0.949233 44217.000000 19785.000000 0.386551 1041.000000 638.000000 10529.000000 0.0 10529.000000
In [138]:
'''The table presents the average socio-economic characteristics of each cluster, offering insight into the distinct profiles of the census tracts grouped by the analysis. Cluster 0 represents middle-income neighborhoods with a balanced mix of renters and homeowners, moderate educational attainment, and a sizable population with disabilities. Cluster 1 includes working-class tracts with moderate income and higher renter rates, slightly elevated poverty levels, and lower levels of educational attainment. Cluster 2 stands out as a high-income, low-density group with the highest median and per capita income, but surprisingly low counts of high school and bachelor’s degree holders, suggesting either small populations or unique demographic compositions. Cluster 3 comprises high-density, low-income areas with the highest renter rates, poverty levels, and disability counts—characteristics often associated with communities experiencing concentrated disadvantage. Together, these clusters reveal significant variation in housing, income, education, and health across New York City census tracts, providing a foundation for spatial analysis and policy targeting.'''
Out[138]:
'The table presents the average socio-economic characteristics of each cluster, offering insight into the distinct profiles of the census tracts grouped by the analysis. Cluster 0 represents middle-income neighborhoods with a balanced mix of renters and homeowners, moderate educational attainment, and a sizable population with disabilities. Cluster 1 includes working-class tracts with moderate income and higher renter rates, slightly elevated poverty levels, and lower levels of educational attainment. Cluster 2 stands out as a high-income, low-density group with the highest median and per capita income, but surprisingly low counts of high school and bachelor’s degree holders, suggesting either small populations or unique demographic compositions. Cluster 3 comprises high-density, low-income areas with the highest renter rates, poverty levels, and disability counts—characteristics often associated with communities experiencing concentrated disadvantage. Together, these clusters reveal significant variation in housing, income, education, and health across New York City census tracts, providing a foundation for spatial analysis and policy targeting.'
In [139]:
#Compare the average value of each variable across clusters

cluster_means = df.groupby("Cluster")[features].mean().T
cluster_means.plot(kind="bar", figsize=(14, 6), rot=45)
plt.title("Cluster Profiles Across Variables")
plt.ylabel("Mean Value")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [140]:
#Show the distribution (spread + outliers) of variables like income or renter rate per cluster

import seaborn as sns
sns.boxplot(data=df, x="Cluster", y="medhhinc")
plt.title("Median Household Income by Cluster")
plt.show()
No description has been provided for this image
In [141]:
#Show all cluster averages as a color-coded heatmap

import seaborn as sns
sns.heatmap(cluster_means, annot=True, cmap="coolwarm")
plt.title("Cluster Variable Averages (Heatmap)")
plt.show()
No description has been provided for this image
In [142]:
'''The visualizations provide a clear and compelling view of the socio-economic differences among the identified clusters. The bar chart comparing cluster profiles highlights stark contrasts, with Cluster 2 emerging as the wealthiest group, exhibiting the highest median and per capita incomes, while Cluster 3 stands out as the most disadvantaged, with the highest renter rates, poverty levels, and disability counts, alongside the lowest income figures. Cluster 0 appears to represent a moderate-income group with relatively high educational attainment, whereas Cluster 1 shows a similar income profile but with a higher proportion of renters and lower education levels. The boxplot of median household income further illustrates the distribution within each cluster, confirming that Cluster 2 contains consistently high-income tracts, while Cluster 3 is both the lowest in income and the least economically diverse. Finally, the heatmap of cluster means reinforces these patterns by visually emphasizing variables that distinguish each group. Cluster 2 consistently scores high in income and low in poverty, Cluster 3 inverts this trend, and Clusters 0 and 1 display more balanced or transitional characteristics. Together, these charts provide strong evidence of socio-spatial inequality and offer a data-driven foundation for further mapping and analysis.'''
Out[142]:
'The visualizations provide a clear and compelling view of the socio-economic differences among the identified clusters. The bar chart comparing cluster profiles highlights stark contrasts, with Cluster 2 emerging as the wealthiest group, exhibiting the highest median and per capita incomes, while Cluster 3 stands out as the most disadvantaged, with the highest renter rates, poverty levels, and disability counts, alongside the lowest income figures. Cluster 0 appears to represent a moderate-income group with relatively high educational attainment, whereas Cluster 1 shows a similar income profile but with a higher proportion of renters and lower education levels. The boxplot of median household income further illustrates the distribution within each cluster, confirming that Cluster 2 contains consistently high-income tracts, while Cluster 3 is both the lowest in income and the least economically diverse. Finally, the heatmap of cluster means reinforces these patterns by visually emphasizing variables that distinguish each group. Cluster 2 consistently scores high in income and low in poverty, Cluster 3 inverts this trend, and Clusters 0 and 1 display more balanced or transitional characteristics. Together, these charts provide strong evidence of socio-spatial inequality and offer a data-driven foundation for further mapping and analysis.'
In [143]:
import pandas as pd
import folium
from IPython.display import display

# Step 1: Load the CSV with cluster data
df = pd.read_csv("clustered_census_data.csv")  # Use your actual CSV file path

# Step 2: Check the structure of the CSV (ensure lat, lon, and cluster columns are present)
print(df.head())

# Step 3: Initialize a base Folium map centered on NYC
m = folium.Map(location=[40.7128, -74.0060], zoom_start=11)

# Step 4: Add markers for each row in the CSV
for _, row in df.iterrows():
    folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],  # Assuming lat/lon columns
        radius=5,
        color="blue" if row["Cluster"] == 0 else
               "orange" if row["Cluster"] == 1 else
               "green" if row["Cluster"] == 2 else
               "red",  # Color based on cluster
        fill=True,
        fill_color="blue" if row["Cluster"] == 0 else
                    "orange" if row["Cluster"] == 1 else
                    "green" if row["Cluster"] == 2 else
                    "red",
        fill_opacity=0.7,
    ).add_to(m)

# Step 5: Display the map in Jupyter or save it as HTML for use
display(m)  # For Jupyter/Colab
m.save("nyc_clusters_map.html")  # For a standalone map
     bct2020        block          lot          cd     bctcb2020  ct2010  \
0  1010000.0  1307.977528  1229.640449  105.617978  1.010000e+10   100.0   
1  1010000.0  1307.977528  1229.640449  105.617978  1.010000e+10   100.0   
2  1010000.0  1307.977528  1229.640449  105.617978  1.010000e+10   100.0   
3  1020000.0  1720.403974   326.062914  110.000000  1.020000e+10   200.0   
4  1020000.0  1720.403974   326.062914  110.000000  1.020000e+10   200.0   

        cb2010  schooldist  council       zipcode  ...  health_ins_f18_24  \
0  1620.000000    2.000000      4.0  10023.123596  ...                0.0   
1  1620.000000    2.000000      4.0  10023.123596  ...                0.0   
2  1620.000000    2.000000      4.0  10023.123596  ...                0.0   
3  1610.804636    4.019868      9.0  10027.357616  ...                0.0   
4  1610.804636    4.019868      9.0  10027.357616  ...                0.0   

                                          NAME  state  county  tract  \
0     Census Tract 101; Kings County; New York     36      47  10100   
1  Census Tract 101; New York County; New York     36      61  10100   
2    Census Tract 101; Queens County; New York     36      81  10100   
3     Census Tract 102; Kings County; New York     36      47  10200   
4  Census Tract 102; New York County; New York     36      61  10200   

       county_name  tract_str  poverty_rate  renter_rate  Cluster  
0     Kings County      10100      0.161742     0.693609      1.0  
1  New York County      10100      0.160166     1.000000      2.0  
2    Queens County      10100      0.045193     0.683664      1.0  
3     Kings County      10200      0.212007     0.632509      1.0  
4  New York County      10200      0.069672     0.591195      2.0  

[5 rows x 94 columns]
Make this Notebook Trusted to load map: File -> Trust Notebook
In [144]:
'The map visualizes the distribution of different clusters across New York City and its surroundings, with each cluster represented by distinct colored markers: blue for Cluster 0, orange for Cluster 1, and red for Cluster 2. The clusters are spread across various geographic areas, with Cluster 0 showing a more scattered distribution, potentially indicating areas with high income or low density. Cluster 1 appears more concentrated, likely representing areas with moderate income or a higher proportion of renters. Cluster 2, marked in red, highlights areas with more extreme socio-economic factors, such as high poverty and disability rates. The map offers a geographic insight into how these clusters align with urban, residential, or low-income areas.'''
Out[144]:
'The map visualizes the distribution of different clusters across New York City and its surroundings, with each cluster represented by distinct colored markers: blue for Cluster 0, orange for Cluster 1, and red for Cluster 2. The clusters are spread across various geographic areas, with Cluster 0 showing a more scattered distribution, potentially indicating areas with high income or low density. Cluster 1 appears more concentrated, likely representing areas with moderate income or a higher proportion of renters. Cluster 2, marked in red, highlights areas with more extreme socio-economic factors, such as high poverty and disability rates. The map offers a geographic insight into how these clusters align with urban, residential, or low-income areas.'
In [149]:
print(df.columns)
Index(['bct2020', 'block', 'lot', 'cd', 'bctcb2020', 'ct2010', 'cb2010',
       'schooldist', 'council', 'zipcode', 'policeprct',
       'healthcenterdistrict', 'healtharea', 'sanitboro', 'sanitdistrict',
       'spdist3', 'landuse', 'easements', 'lotarea', 'bldgarea', 'comarea',
       'resarea', 'officearea', 'retailarea', 'garagearea', 'strgearea',
       'factryarea', 'otherarea', 'areasource', 'numbldgs', 'numfloors',
       'unitsres', 'unitstotal', 'lotfront', 'lotdepth', 'bldgfront',
       'bldgdepth', 'proxcode', 'lottype', 'bsmtcode', 'assessland',
       'assesstot', 'exempttot', 'yearbuilt', 'yearalter1', 'yearalter2',
       'builtfar', 'residfar', 'commfar', 'facilfar', 'borocode', 'bbl',
       'condono', 'tract2010', 'xcoord', 'ycoord', 'taxmap', 'appbbl',
       'plutomapid', 'firm07_flag', 'pfirm15_flag', 'latitude', 'longitude',
       'notes', 'clean_bct', 'population', 'medhhinc', 'percapita_income',
       'poverty_total', 'poverty_below', 'housing_total', 'housing_owner',
       'housing_renter', 'nativity_total', 'nativity_foreign', 'hs_grad',
       'bachelors_deg', 'masters_deg', 'prof_school_deg', 'doctorate_deg',
       'disability_total', 'disability_under18', 'health_ins_total',
       'health_ins_m18_24', 'health_ins_f18_24', 'NAME', 'state', 'county',
       'tract', 'county_name', 'tract_str', 'poverty_rate', 'renter_rate',
       'Cluster'],
      dtype='object')
In [155]:
# Check the distribution of clusters in the dataset
print(df["Cluster"].value_counts())
Cluster
1    31
0     7
2     3
3     1
Name: count, dtype: int64
In [151]:
#experiment: predict the cluster a census tract will fall into based on its socio-econ features

from sklearn.ensemble import RandomForestClassifier

# Define the features and target
X = df[["poverty_below", "housing_renter", "bachelors_deg",'health_ins_total']]  
y = df["Cluster"]  # Target variable (cluster labels)

# Initialize the model and fit it
model = RandomForestClassifier()
model.fit(X, y)

# Predict the cluster for new data
predictions = model.predict(X)  # Or new data, if available
print(predictions)
[1 2 1 1 2 1 1 1 1 2 1 1 1 1 1 0 0 1 0 0 1 1 0 0 1 1 1 1 3 1 1 1 1 1 1 1 1
 0 1 1 1 1]
In [153]:
#evaluate the model
from sklearn.metrics import accuracy_score, classification_report

# Evaluate accuracy
accuracy = accuracy_score(y, predictions)
print(f"Accuracy: {accuracy}")

# Detailed classification report
print(classification_report(y, predictions))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00        31
           2       1.00      1.00      1.00         3
           3       1.00      1.00      1.00         1

    accuracy                           1.00        42
   macro avg       1.00      1.00      1.00        42
weighted avg       1.00      1.00      1.00        42

In [154]:
# Get feature importance
feature_importance = model.feature_importances_

# Create a DataFrame for better visualization
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importance
})

# Sort features by importance
importance_df = importance_df.sort_values(by="Importance", ascending=False)
print(importance_df)
            Feature  Importance
2     bachelors_deg    0.446122
3  health_ins_total    0.355072
1    housing_renter    0.122080
0     poverty_below    0.076726
In [152]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Fit a KMeans model to your data (use the same features as above)
kmeans = KMeans(n_clusters=4, random_state=42)  # Adjust n_clusters if needed
kmeans.fit(X)

# Calculate silhouette score to measure the quality of clusters
sil_score = silhouette_score(X, kmeans.labels_)
print(f"Silhouette Score: {sil_score}")
Silhouette Score: 0.481637533817962
In [158]:
'''#This indicates that the clusters are moderately well-defined, but not perfectly. A higher silhouette score closer to 1 would
#indicate better separation between clusters. A score around 0.48 suggests that while the clusters are somewhat distinct, 
#there’s room for improvement in the clustering process.

implications for Cluster Analysis:

Class Imbalance: The imbalance in cluster distribution could have affected the model’s ability to predict less frequent clusters 
(Clusters 2 and 3). You might want to consider balancing the data or using techniques like class weighting or sampling to improve 
predictions for these underrepresented clusters.

Overfitting Concern: The model’s perfect accuracy could suggest overfitting to the training data. If you apply the model to new data,
the performance might not be as good. Evaluating it on test data (or using cross-validation) will provide a clearer picture of its 
generalization ability.

Feature Importance: The model heavily relies on features like education and health insurance, which is useful information if
you’re interpreting socio-economic trends. It might be worth investigating why these variables are so important in distinguishing clusters.

Cluster Separation: The silhouette score shows that while your clusters have some degree of separation, the model could be 
improved by tuning parameters like the number of clusters or by exploring different clustering techniques (e.g., hierarchical clustering).'''
Out[158]:
'#This indicates that the clusters are moderately well-defined, but not perfectly. A higher silhouette score closer to 1 would\n#indicate better separation between clusters. A score around 0.48 suggests that while the clusters are somewhat distinct, \n#there’s room for improvement in the clustering process.\n\nimplications for Cluster Analysis:\n\nClass Imbalance: The imbalance in cluster distribution could have affected the model’s ability to predict less frequent clusters \n(Clusters 2 and 3). You might want to consider balancing the data or using techniques like class weighting or sampling to improve \npredictions for these underrepresented clusters.\n\nOverfitting Concern: The model’s perfect accuracy could suggest overfitting to the training data. If you apply the model to new data,\nthe performance might not be as good. Evaluating it on test data (or using cross-validation) will provide a clearer picture of its \ngeneralization ability.\n\nFeature Importance: The model heavily relies on features like education and health insurance, which is useful information if\nyou’re interpreting socio-economic trends. It might be worth investigating why these variables are so important in distinguishing clusters.\n\nCluster Separation: The silhouette score shows that while your clusters have some degree of separation, the model could be \nimproved by tuning parameters like the number of clusters or by exploring different clustering techniques (e.g., hierarchical clustering).'
In [160]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Define the features
X = df[["poverty_below", "housing_renter", "bachelors_deg", "health_ins_total"]]  # Adjust features if necessary

# Range of clusters to test (e.g., 2 to 10 clusters)
cluster_range = range(2, 11)

# List to store silhouette scores for each number of clusters
sil_scores = []

# Loop through different cluster numbers
for n_clusters in cluster_range:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X)
    
    # Calculate the silhouette score for each clustering result
    score = silhouette_score(X, kmeans.labels_)
    sil_scores.append(score)

# Plot silhouette scores for different numbers of clusters
plt.figure(figsize=(8, 6))
plt.plot(cluster_range, sil_scores, marker='o', linestyle='--')
plt.title("Silhouette Score for Different Numbers of Clusters")
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Score")
plt.grid(True)
plt.tight_layout()
plt.show()

# Print the silhouette scores for each cluster count
for n_clusters, score in zip(cluster_range, sil_scores):
    print(f"Clusters: {n_clusters}, Silhouette Score: {score}")
No description has been provided for this image
Clusters: 2, Silhouette Score: 0.5022058123253923
Clusters: 3, Silhouette Score: 0.47293931184825605
Clusters: 4, Silhouette Score: 0.481637533817962
Clusters: 5, Silhouette Score: 0.3986128988890994
Clusters: 6, Silhouette Score: 0.39090080881248856
Clusters: 7, Silhouette Score: 0.4045111243251314
Clusters: 8, Silhouette Score: 0.4020455840111953
Clusters: 9, Silhouette Score: 0.3908044035886829
Clusters: 10, Silhouette Score: 0.37627035168471223
In [161]:
'''the above: 
	•	tests different cluster numbers from 2 to 10.
	•	calculates the silhouette score for each cluster number.
	•	visualizes the silhouette scores to help us choose the best number of clusters based on the highest score.'''
Out[161]:
'the above: \n\t•\tIt tests different cluster numbers from 2 to 10.\n\t•\tIt calculates the silhouette score for each cluster number.\n\t•\tIt visualizes the silhouette scores to help us choose the best number of clusters based on the highest score.'
In [164]:
from sklearn.cluster import DBSCAN

# Initialize DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(X)

# Visualize the results (using PCA for 2D visualization)
X_pca = pca.fit_transform(X)

plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=dbscan_labels, cmap='viridis', marker='o')
plt.title("DBSCAN Clustering")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.grid(True)
plt.tight_layout()
plt.show()

'''DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is a density-based clustering algorithm that identifies clusters based on the density of points in the feature space. Unlike KMeans, which requires specifying the number of clusters beforehand, DBSCAN has two main parameters that guide the clustering process:'''
No description has been provided for this image
Out[164]:
'DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is a density-based clustering algorithm that identifies clusters based on the density of points in the feature space. Unlike KMeans, which requires specifying the number of clusters beforehand, DBSCAN has two main parameters that guide the clustering process:'
In [171]:
from sklearn.cluster import AgglomerativeClustering

# Initialize AgglomerativeClustering
agglo = AgglomerativeClustering(n_clusters=4)  # You can change the number of clusters
agglo_labels = agglo.fit_predict(X)

# Visualize the results (using PCA for 2D visualization)
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=agglo_labels, cmap='viridis', marker='o')
plt.title("Agglomerative Clustering")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.grid(True)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [172]:
#lets redo our analysis by using the algorithmic clustering
# Calculate the mean of each feature for each cluster
cluster_means = df.groupby("Cluster")[["poverty_below", "housing_renter", "bachelors_deg", "health_ins_total", "population", "housing_total", "renter_rate", "medhhinc", "percapita_income"]].mean()

# Print the cluster means to interpret them
print(cluster_means)
         poverty_below  housing_renter  bachelors_deg  health_ins_total  \
Cluster                                                                   
0           753.571429     1161.142857     446.142857       5724.000000   
1           470.935484      694.193548     147.870968       3051.774194   
2           162.666667      600.666667      26.333333       1382.000000   
3          4070.000000     3403.000000     638.000000      10529.000000   

           population  housing_total  renter_rate       medhhinc  \
Cluster                                                            
0         5736.571429    2119.428571     0.501938   80901.714286   
1         3094.774194    1075.806452     0.609677   84023.645161   
2         1383.666667     714.666667     0.757479  124717.666667   
3        10529.000000    3585.000000     0.949233   44217.000000   

         percapita_income  
Cluster                    
0            38254.428571  
1            39811.709677  
2           115156.333333  
3            19785.000000  
In [173]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot a bar chart for each feature
plt.figure(figsize=(14, 10))

# Plot the average values of each feature across clusters
cluster_means.plot(kind="bar", figsize=(14, 8), colormap="Set3")
plt.title("Average Values of Each Feature Across Clusters")
plt.xlabel("Cluster")
plt.ylabel("Average Value")
plt.grid(True)
plt.tight_layout()
plt.show()
<Figure size 1400x1000 with 0 Axes>
No description has been provided for this image
In [174]:
# Create a box plot to show the distribution of each feature across clusters
features = ["poverty_below", "housing_renter", "bachelors_deg", "health_ins_total", "population", "housing_total", "renter_rate", "medhhinc", "percapita_income"]

plt.figure(figsize=(15, 10))
for i, feature in enumerate(features, 1):
    plt.subplot(3, 3, i)  # Arrange the boxplots in a 3x3 grid
    sns.boxplot(x="Cluster", y=feature, data=df)
    plt.title(f"Boxplot of {feature} by Cluster")
    plt.tight_layout()

plt.show()
No description has been provided for this image
In [175]:
import seaborn as sns

# Plot a heatmap to show cluster averages
plt.figure(figsize=(10, 8))
sns.heatmap(cluster_means, annot=True, cmap="coolwarm", fmt=".2f", cbar=True)
plt.title("Cluster Variable Averages (Heatmap)")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [178]:
import folium
import pandas as pd

# Assuming 'df' has latitude and longitude columns, and 'Cluster' for clustering information

# Create a base map centered on NYC
m = folium.Map(location=[40.7128, -74.0060], zoom_start=11)

# Color map for the clusters
color_map = {0: 'blue', 1: 'green', 2: 'red', 3: 'purple'}

# Plot the clusters on the map using CircleMarkers
for i, row in df.iterrows():
    folium.CircleMarker(
        location=[row["latitude"], row["longitude"]],  # Ensure you have latitude and longitude columns
        radius=5,
        color=color_map[row["Cluster"]],  # Color based on the cluster label
        fill=True,
        fill_color=color_map[row["Cluster"]],
        fill_opacity=0.7
    ).add_to(m)

# Save the map to an HTML file
m.save("nyc_clusters_map.html")

# Display the map (for Jupyter/Colab)
display(m)
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]:
'''	•	Cluster 0: Represents areas with moderate to high poverty, moderate rent, and middle-income neighborhoods.
	•	Cluster 1: Areas with moderate socio-economic conditions, both in terms of poverty and housing rent, with middle-income levels.
	•	Cluster 2: Affluent neighborhoods with low poverty and high incomes, but with low education and health insurance coverage.
	•	Cluster 3: Low-income, large population areas with high poverty and high renter rates, but low income overall.'''
In [ ]: