In [1]:
# Essential import statements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# this is sometimes helpful to suppress scientific notation in the display of arrays
np.set_printoptions(suppress=True)
In [1]:
#import Census package
from census import Census
import time
In [2]:
from census import Census
import pandas as pd

# Replace with your Census API Key
API_KEY = "KEY"

# Initialize Census API client (ACS 5-Year, 2023)
c = Census(API_KEY, year=2023)

# Define the Census variables to pull (Expanded Data including new requests)
variables = [
    "NAME",
    
    # Population & Demographics
    "B02001_001E",  # Total population
    
    # Income & Poverty
    "B19013_001E",  # Median household income
    "B19301_001E",  # Per capita income
    "B17021_001E",  # Total population for poverty status
    "B17021_002E",  # Population below poverty level

    # Housing
    "B25003_001E",  # Total housing units
    "B25003_002E",  # Owner-occupied housing
    "B25003_003E",  # Renter-occupied housing

    # Immigration
    "B05002_001E",  # Total population for nativity
    "B05002_013E",  # Foreign-born population

    # Education
    "B15003_001E",  # Total population 25+ for education determination
    "B15003_017E",  # High school graduate (includes equivalency)
    "B15003_018E",  # Some college, less than 1 year
    "B15003_019E",  # Some college, 1+ years, no degree
    "B15003_020E",  # Associate's degree
    "B15003_021E",  # Bachelor's degree
    "B15003_022E",  # Master's degree
    "B15003_023E",  # Professional school degree
    "B15003_024E",  # Doctorate degree
    "B14001_001E",  # Total school enrollment
    "B14001_002E",  # Enrolled in school (Nursery to 12th grade)
    "B14001_008E",  # Enrolled in college or graduate school

    # Health & Disability
    "B18135_001E",  # Total population with a disability
    "B18135_007E",  # People under 18 with a disability
    "B27001_001E",  # Total population for health insurance
    "B27001_005E",  # Males 18-24 with health insurance
    "B27001_033E",  # Females 18-24 with health insurance
]

# Define geography (Florida, FIPS 12)
geo_filter = {"for": "tract:*", "in": "state:12 county:*"}

# Pull data
print("Pulling 2023 ACS Data for Florida...")
data = c.acs5.get(variables, geo_filter)

# Convert to DataFrame
df = pd.DataFrame(data)

# Rename columns for clarity
df.rename(columns={
    "NAME": "census_tract",
    
    # Population & Demographics
    "B02001_001E": "total_population",

    # Income & Poverty
    "B19013_001E": "median_household_income",
    "B19301_001E": "per_capita_income",
    "B17021_001E": "poverty_population",
    "B17021_002E": "poverty_below_population",

    # Housing
    "B25003_001E": "total_housing_units",
    "B25003_002E": "owner_occupied_units",
    "B25003_003E": "renter_occupied_units",

    # Immigration
    "B05002_001E": "total_nativity_population",
    "B05002_013E": "foreign_born_population",

    # Education
    "B15003_001E": "total_population_25plus",
    "B15003_017E": "high_school_graduate",
    "B15003_018E": "some_college_less_than_1_year",
    "B15003_019E": "some_college_more_than_1_year",
    "B15003_020E": "associates_degree",
    "B15003_021E": "bachelors_degree",
    "B15003_022E": "masters_degree",
    "B15003_023E": "professional_school_degree",
    "B15003_024E": "doctorate_degree",
    "B14001_001E": "total_school_enrollment",
    "B14001_002E": "nursery_to_high_school_enrollment",
    "B14001_008E": "college_or_grad_school_enrollment",

    # Health & Disability
    "B18135_001E": "total_population_with_disability",
    "B18135_007E": "disability_under_18",
    "B27001_001E": "total_population_with_health_insurance",
    "B27001_005E": "males_18_24_with_health_insurance",
    "B27001_033E": "females_18_24_with_health_insurance",
}, inplace=True)

# Save to CSV (Renamed Data)
df.to_csv("acs_2023_florida_renamed.csv", index=False)
print("Data saved as acs_2023_florida_renamed.csv")

# Display sample rows
print(df.sample(5))
Pulling 2023 ACS Data for Florida...
[KeyboardInterrupt traceback omitted: the cell was interrupted by hand while the census client was still fetching variable metadata from api.census.gov, deep inside the requests/urllib3 call stack.]
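The pull above was interrupted by hand while the client was still fetching variable metadata. If the request is simply slow or flaky, one option is to wrap the call in a small retry loop with a pause between attempts. The sketch below is only illustrative (the helper name pull_with_retries is made up) and assumes the c, variables, and geo_filter objects from the cell above.
In [ ]:
# Illustrative sketch (not part of the original run): retry the ACS pull a few times,
# pausing between attempts, instead of letting a slow request hang the cell.
import time

def pull_with_retries(client, fields, geo, attempts=3, pause=10):
    for attempt in range(1, attempts + 1):
        try:
            return client.acs5.get(fields, geo)
        except Exception as exc:  # broad catch purely for illustration
            print(f"Attempt {attempt} failed: {exc}")
            if attempt == attempts:
                raise
            time.sleep(pause)

# data = pull_with_retries(c, variables, geo_filter)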
In [33]:
# Load and verify the Census data file that we previously created using the API
florida_df = pd.read_csv('acs_2023_florida_renamed.csv') 

#############
# What are some ways to preview this dataset? 


# Display basic dataset information
print("Dataset Info:")
florida_df.info()

# Display the first few rows of the dataset
print("\nPreview of the dataset:")
florida_df.head()
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5160 entries, 0 to 5159
Data columns (total 31 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   census_tract                            5160 non-null   object 
 1   total_population                        5160 non-null   float64
 2   median_household_income                 5160 non-null   float64
 3   per_capita_income                       5158 non-null   float64
 4   poverty_population                      5160 non-null   float64
 5   poverty_below_population                5160 non-null   float64
 6   total_housing_units                     5160 non-null   float64
 7   owner_occupied_units                    5160 non-null   float64
 8   renter_occupied_units                   5160 non-null   float64
 9   total_nativity_population               5160 non-null   float64
 10  foreign_born_population                 5160 non-null   float64
 11  total_population_25plus                 5160 non-null   float64
 12  high_school_graduate                    5160 non-null   float64
 13  some_college_less_than_1_year           5160 non-null   float64
 14  some_college_more_than_1_year           5160 non-null   float64
 15  associates_degree                       5160 non-null   float64
 16  bachelors_degree                        5160 non-null   float64
 17  masters_degree                          5160 non-null   float64
 18  professional_school_degree              5160 non-null   float64
 19  doctorate_degree                        5160 non-null   float64
 20  total_school_enrollment                 5160 non-null   float64
 21  nursery_to_high_school_enrollment       5160 non-null   float64
 22  college_or_grad_school_enrollment       5160 non-null   float64
 23  total_population_with_disability        5160 non-null   float64
 24  disability_under_18                     5160 non-null   float64
 25  total_population_with_health_insurance  5160 non-null   float64
 26  males_18_24_with_health_insurance       5160 non-null   float64
 27  females_18_24_with_health_insurance     5160 non-null   float64
 28  state                                   5160 non-null   int64  
 29  county                                  5160 non-null   int64  
 30  tract                                   5160 non-null   int64  
dtypes: float64(27), int64(3), object(1)
memory usage: 1.2+ MB

Preview of the dataset:
Out[33]:
census_tract total_population median_household_income per_capita_income poverty_population poverty_below_population total_housing_units owner_occupied_units renter_occupied_units total_nativity_population ... nursery_to_high_school_enrollment college_or_grad_school_enrollment total_population_with_disability disability_under_18 total_population_with_health_insurance males_18_24_with_health_insurance females_18_24_with_health_insurance state county tract
0 Census Tract 2.01; Alachua County; Florida 5187.0 18657.0 16690.0 5006.0 3239.0 2319.0 153.0 2166.0 5187.0 ... 3987.0 3099.0 5185.0 13.0 5185.0 0.0 0.0 12 1 201
1 Census Tract 2.02; Alachua County; Florida 5897.0 17609.0 11493.0 4509.0 3069.0 1897.0 247.0 1650.0 5897.0 ... 4878.0 4152.0 5897.0 0.0 5897.0 0.0 0.0 12 1 202
2 Census Tract 3.01; Alachua County; Florida 3703.0 47813.0 28654.0 3703.0 1186.0 1855.0 427.0 1428.0 3703.0 ... 1429.0 778.0 3703.0 0.0 3703.0 0.0 0.0 12 1 301
3 Census Tract 3.02; Alachua County; Florida 2500.0 39583.0 25978.0 2500.0 394.0 1255.0 603.0 652.0 2500.0 ... 393.0 20.0 2500.0 0.0 2500.0 0.0 0.0 12 1 302
4 Census Tract 4; Alachua County; Florida 5736.0 51266.0 27362.0 5736.0 1075.0 2414.0 1297.0 1117.0 5736.0 ... 1637.0 336.0 5736.0 0.0 5736.0 0.0 0.0 12 1 400

5 rows × 31 columns

In [34]:
# Display column names
print("Columns in the dataset:")
print(florida_df.columns)

# Show summary statistics for numerical columns
print("\nSummary Statistics:")
florida_df.describe()
Columns in the dataset:
Index(['census_tract', 'total_population', 'median_household_income',
       'per_capita_income', 'poverty_population', 'poverty_below_population',
       'total_housing_units', 'owner_occupied_units', 'renter_occupied_units',
       'total_nativity_population', 'foreign_born_population',
       'total_population_25plus', 'high_school_graduate',
       'some_college_less_than_1_year', 'some_college_more_than_1_year',
       'associates_degree', 'bachelors_degree', 'masters_degree',
       'professional_school_degree', 'doctorate_degree',
       'total_school_enrollment', 'nursery_to_high_school_enrollment',
       'college_or_grad_school_enrollment', 'total_population_with_disability',
       'disability_under_18', 'total_population_with_health_insurance',
       'males_18_24_with_health_insurance',
       'females_18_24_with_health_insurance', 'state', 'county', 'tract'],
      dtype='object')

Summary Statistics:
Out[34]:
total_population median_household_income per_capita_income poverty_population poverty_below_population total_housing_units owner_occupied_units renter_occupied_units total_nativity_population foreign_born_population ... nursery_to_high_school_enrollment college_or_grad_school_enrollment total_population_with_disability disability_under_18 total_population_with_health_insurance males_18_24_with_health_insurance females_18_24_with_health_insurance state county tract
count 5160.00000 5.160000e+03 5.158000e+03 5160.000000 5160.000000 5160.000000 5160.000000 5160.000000 5160.00000 5160.000000 ... 5160.000000 5160.000000 5160.000000 5160.000000 5160.000000 5160.000000 5160.000000 5160.0 5160.000000 5160.000000
mean 4249.78314 -1.374931e+07 -1.016864e+07 4158.910465 524.747674 1657.153295 1115.660659 541.492636 4249.78314 909.771124 ... 933.263566 207.125388 4184.416279 2.440504 4184.416279 7.444380 7.454845 12.0 71.692442 87157.662597
std 2174.55402 9.502055e+07 8.188418e+07 2169.989070 436.279490 786.672294 679.156836 464.933551 2174.55402 915.272246 ... 746.118635 395.274525 2167.695027 11.741644 2167.695027 23.035979 22.739017 0.0 35.938525 229403.987266
min 0.00000 -6.666667e+08 -6.666667e+08 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 12.0 1.000000 100.000000
25% 2811.50000 5.323475e+04 2.767100e+04 2730.500000 216.000000 1136.000000 658.000000 202.000000 2811.50000 251.000000 ... 452.750000 65.000000 2747.750000 0.000000 2747.750000 0.000000 0.000000 12.0 39.000000 6402.750000
50% 3973.00000 6.991300e+04 3.642950e+04 3885.000000 407.000000 1561.500000 1021.500000 413.000000 3973.00000 571.000000 ... 794.500000 139.000000 3904.000000 0.000000 3904.000000 0.000000 0.000000 12.0 86.000000 14405.000000
75% 5377.00000 9.137600e+04 4.956750e+04 5259.000000 719.250000 2081.250000 1463.250000 747.000000 5377.00000 1285.250000 ... 1228.000000 248.000000 5291.000000 0.000000 5291.000000 0.000000 0.000000 12.0 99.000000 40202.000000
max 24659.00000 2.500010e+05 3.007780e+05 24659.000000 3303.000000 7964.000000 7238.000000 3745.000000 24659.00000 7585.000000 ... 13134.000000 13104.000000 24659.000000 299.000000 24659.000000 370.000000 583.000000 12.0 133.000000 990200.000000

8 rows × 30 columns

In [35]:
# Define a variable we can use to select different input features for the analysis
# I keep a few different sets here using comments to choose one at a time

cluster_df = florida_df[['median_household_income', 'poverty_population',  
        'foreign_born_population', 'high_school_graduate', 'bachelors_degree', 
 'total_population_with_health_insurance',]]
#############
# After we run the one above together, you will try your own with four variables (an example set is sketched after this cell's output).
#############

# Preview our selected data so we can see what will be clustered
display(cluster_df)
median_household_income poverty_population foreign_born_population high_school_graduate bachelors_degree total_population_with_health_insurance
0 18657.0 5006.0 715.0 75.0 178.0 5185.0
1 17609.0 4509.0 226.0 95.0 78.0 5897.0
2 47813.0 3703.0 213.0 442.0 131.0 3703.0
3 39583.0 2500.0 199.0 421.0 206.0 2500.0
4 51266.0 5736.0 263.0 848.0 170.0 5736.0
... ... ... ... ... ... ...
5155 45469.0 2342.0 69.0 570.0 127.0 2434.0
5156 53220.0 3208.0 27.0 901.0 127.0 3157.0
5157 58099.0 2671.0 27.0 682.0 98.0 2712.0
5158 56324.0 4798.0 300.0 1790.0 353.0 4798.0
5159 48428.0 4086.0 71.0 993.0 297.0 4104.0

5160 rows × 6 columns
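For the "try your own with four variables" exercise mentioned above, here is one hypothetical example of how a different feature set could be swapped in; the four column names are only a suggestion drawn from florida_df.
In [ ]:
# Hypothetical example for the four-variable exercise: any four numeric columns from
# florida_df can be substituted for cluster_df before re-running the scaling step.
cluster_df_alt = florida_df[['owner_occupied_units', 'renter_occupied_units',
                             'masters_degree', 'foreign_born_population']]
display(cluster_df_alt.head())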

In [37]:
# Scale our features data to range between 0 and 1
# To do so we use the MinMaxScaler
# Note that this outputs a numpy array
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Transform the data
X = scaler.fit_transform(cluster_df)
display(pd.DataFrame(X,columns=cluster_df.columns))
median_household_income poverty_population foreign_born_population high_school_graduate bachelors_degree total_population_with_health_insurance
0 0.999653 0.203009 0.094265 0.020309 0.078380 0.210268
1 0.999652 0.182854 0.029796 0.025724 0.034346 0.239142
2 0.999697 0.150168 0.028082 0.119686 0.057684 0.150168
3 0.999684 0.101383 0.026236 0.113999 0.090709 0.101383
4 0.999702 0.232613 0.034674 0.229624 0.074857 0.232613
... ... ... ... ... ... ...
5155 0.999693 0.094975 0.009097 0.154346 0.055923 0.098706
5156 0.999705 0.130094 0.003560 0.243975 0.055923 0.128026
5157 0.999712 0.108317 0.003560 0.184674 0.043153 0.109980
5158 0.999710 0.194574 0.039552 0.484701 0.155438 0.194574
5159 0.999698 0.165700 0.009361 0.268887 0.130779 0.166430

5160 rows × 6 columns
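One thing worth noticing in the scaled preview above: median_household_income sits near 0.9997 for almost every tract. The ACS publishes a jam value of -666666666 for estimates it cannot compute (visible as the large negative minimum in describe() earlier), and that extreme minimum compresses the MinMax-scaled incomes toward 1. The sketch below shows one way to treat the sentinel as missing before scaling; it is an optional cleanup step, not something the rest of this notebook applies.
In [ ]:
# Optional sketch: treat the ACS jam value (-666666666, used for suppressed estimates)
# as missing before scaling, so it no longer dominates the MinMax range.
cluster_df_clean = cluster_df.mask(cluster_df <= -666666666)  # sentinel -> NaN
print(cluster_df_clean.isna().sum())                          # tracts affected per column
X_clean = MinMaxScaler().fit_transform(cluster_df_clean.dropna())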

In [38]:
# This code uses the elbow method to suggest how many clusters to use in the K-Means analysis below

from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Ensure X is a Pandas DataFrame
# (note: converting the array without columns= drops the feature names, which is why
#  later loadings and column listings show positions 0-5 instead of names)
if isinstance(X, np.ndarray):
    X = pd.DataFrame(X)

# Ensure X contains only numerical data and no missing values
X = X.dropna()  # Drop missing values
X = X.select_dtypes(include=[np.number])  # Keep only numeric columns

# Initialize list to store inertia values
inertias = []

# Loop through possible cluster sizes
for i in range(2, 11):  # Start at 2 clusters
    kmeans_elbow = KMeans(n_clusters=i, n_init="auto", random_state=42)
    kmeans_elbow.fit(X)
    inertias.append(kmeans_elbow.inertia_)

# Plot the Elbow Method
plt.plot(range(2, 11), inertias, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()
[Figure: elbow plot of K-Means inertia vs. number of clusters (K = 2 to 10)]
In [39]:
'''	1.	Steep Drop Until K=4
	•	The inertia decreases rapidly from K=2 to K=4, meaning adding more clusters significantly improves the fit (reduces variance within clusters).
	•	This suggests that 4 clusters capture major structure in the data.
	2.	Gradual Decrease After K=4
	•	Beyond K=4, the improvement slows down—indicating diminishing returns.
	•	This suggests that additional clusters don’t add much new structure but instead refine existing groups.
	3.	Elbow Point at K=4
	•	The elbow (where the curve starts flattening) appears around K=4.
	•	This suggests that 4 clusters might be an optimal choice, balancing simplicity and accuracy.


Since we used several socio-economic features (income, poverty, foreign-born population, education, and health insurance coverage):
	•	Each cluster likely represents distinct socio-economic groups in Florida.
	•	K=4 means we have 4 dominant socio-economic patterns based on the input features.

Possible Cluster Interpretations (Hypothesis)
	•	Cluster 1: High-income, highly-educated, high homeownership areas.
	•	Cluster 2: Middle-income, mixed education, moderate homeownership.
	•	Cluster 3: Lower-income, lower education, high rental population.
	•	Cluster 4: High foreign-born, lower median income, variable education.'''
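To make the "diminishing returns" argument above a little more concrete, the sketch below prints the percentage drop in inertia for each added cluster, reusing the inertias list from the elbow cell.
In [ ]:
# Sketch: quantify the elbow by printing the relative drop in inertia per added cluster.
ks = list(range(2, 11))
drops = -np.diff(inertias) / np.array(inertias[:-1]) * 100
for k, d in zip(ks[1:], drops):
    print(f"K={k}: inertia falls by {d:.1f}% relative to K={k-1}")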
In [40]:
# Conduct the K-means analysis
# First defining a variable to control number of k categories
n_k = 4
from sklearn.cluster import KMeans
kmeans_model = KMeans(n_clusters=n_k, random_state=0, n_init="auto").fit(X)
# Display our cluster center means (in the 0-1 scaled feature space)
display(pd.DataFrame(np.round(kmeans_model.cluster_centers_, decimals=4),columns=cluster_df.columns))
median_household_income poverty_population foreign_born_population high_school_graduate bachelors_degree total_population_with_health_insurance
0 0.9997 0.3078 0.3113 0.3522 0.2592 0.3081
1 0.0000 0.0059 0.0125 0.0147 0.0060 0.0224
2 0.9997 0.1098 0.0606 0.1172 0.0838 0.1109
3 0.9997 0.1981 0.1294 0.2375 0.1626 0.1985
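A quick follow-up that often helps when reading the center table above is to see how many tracts land in each cluster. The sketch below attaches the labels back onto florida_df (the column name kmeans_cluster is arbitrary) and assumes the earlier dropna() removed no rows, which holds here since the selected columns are fully populated.
In [ ]:
# Sketch: attach the K-Means labels to the tract-level DataFrame and count tracts per cluster.
florida_labeled = florida_df.copy()
florida_labeled['kmeans_cluster'] = kmeans_model.labels_
print(florida_labeled['kmeans_cluster'].value_counts().sort_index())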
In [ ]:
'''The PCA code later in this notebook reduces the dimensionality of the dataset and identifies the features that contribute most to its variability. Principal components are linear combinations of the original features, constructed to capture the most significant variance in the dataset.
PCA also reports how much variance is explained by each principal component.
The first principal component (PC1) explains 60.7% of the variance, while the second principal component (PC2) explains 15.2%.
Together, they capture about 76% of the total variation in the dataset.
Cluster 1 (Purple): Likely represents low-income areas, with lower education and higher poverty levels.
Cluster 2 (Green): Likely represents higher-income areas with higher education levels.
Cluster 3 (Yellow): Potential outliers or special cases, such as urban centers or areas with unique socio-economic traits.
'''
In [41]:
# Predict and visualize the clusters using pairs of features
# First, make the predictions
y_label = kmeans_model.fit_predict(X)  # Cluster assignments

# Convert X to DataFrame if it's not already one
if isinstance(X, np.ndarray):
    X = pd.DataFrame(X, columns=cluster_df.columns)

# Set feature columns for visualization
column_x = 1  # Choose a valid feature column index
column_y = 2  # Choose another valid feature column index

# Ensure column indices are within bounds
if column_x >= X.shape[1] or column_y >= X.shape[1]:
    raise ValueError("Column indices out of bounds. Check your column selection.")

# Scatter plot using clustering results
plt.scatter(X.iloc[:, column_x], X.iloc[:, column_y], c=y_label, cmap="viridis", s=20)

# Label axes correctly
plt.xlabel(X.columns[column_x])
plt.ylabel(X.columns[column_y])
plt.title("Cluster Visualization")
plt.colorbar(label="Cluster")

plt.show()
[Figure: scatter plot of the K-Means clusters over two scaled features, colored by cluster]
In [5]:
'''Above: The plot shows how the census tracts cluster based on two of the scaled input features (poverty_population on the x-axis and foreign_born_population on the y-axis); the view in PCA space comes later in the notebook. The colors represent different clusters, and the separation indicates that the tracts group into distinct clusters.

Cluster 1 (Purple): Likely represents low-income areas, with lower education and higher poverty levels. Cluster 2 (Green): Likely represents higher-income areas with higher education levels. Cluster 3 (Yellow): Potential outliers or special cases, such as urban centers or areas with unique socio-economic traits.'''
In [42]:
# Convert y_label to a Pandas Series if it's not already
if isinstance(y_label, np.ndarray):
    y_label = pd.Series(y_label, index=X.index)

# Then rerun the scatter plot
for i in range(n_k):
    plt.scatter(X.loc[y_label == i, column_x], X.loc[y_label == i, column_y], s=2, label='Cluster '+str(i))

plt.xlabel(cluster_df.columns[column_x])
plt.ylabel(cluster_df.columns[column_y])
plt.legend(loc="lower right")
plt.show()
[Figure: per-cluster scatter plot of the same two scaled features, one color per cluster]
In [43]:
# Visualize the tree (dendrogram) for this data
from sklearn.cluster import AgglomerativeClustering
agglom_model = AgglomerativeClustering(distance_threshold=0,n_clusters=None)
agglom_model = agglom_model.fit(X)

from scipy.cluster.hierarchy import dendrogram
def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack([model.children_, model.distances_, counts]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

plt.title("Hierarchical Clustering Dendrogram")
# plot the top five levels of the dendrogram (p=5)
plot_dendrogram(agglom_model, truncate_mode="level", p=5)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()
[Figure: hierarchical clustering dendrogram, truncated to the top five levels]
In [3]:
'''Above: In essence, the dendrogram shows how data points are hierarchically grouped based on similarity, with each successive level of merging representing a broader grouping of data points. By choosing a particular cut-off on the y-axis, you can determine how many clusters to identify in the data.

This technique is useful for identifying patterns, relationships, and the number of natural clusters present in the data.'''
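To act on the cut-off idea described above, one option is simply to re-fit the agglomerative model with a fixed number of clusters instead of a distance threshold; the sketch below uses four clusters to mirror the K-Means choice.
In [ ]:
# Sketch: "cut" the hierarchy at four clusters and count how many tracts fall in each.
from sklearn.cluster import AgglomerativeClustering
agglom_4 = AgglomerativeClustering(n_clusters=4).fit(X)
print(pd.Series(agglom_4.labels_).value_counts().sort_index())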
In [1]:
'''Below: Principal Component Analysis (PCA) is a statistical technique used for dimensionality reduction while preserving as much
variability (information) as possible in the data. It transforms a large set of possibly correlated variables into a smaller set of
uncorrelated variables known as principal components. These components are linear combinations of the original variables and 
capture the most significant variance in the data.
'''
In [44]:
# Import necessary libraries
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Step 1: Standardize the data (excluding non-numeric columns)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(cluster_df)  # Ensure `cluster_df` contains only numerical features

# Step 2: Apply PCA to reduce dimensions
pca = PCA(n_components=2)  # Reduce to 2 principal components
X_pca = pca.fit_transform(X_scaled)

# Step 3: Convert PCA result into a DataFrame
pca_df = pd.DataFrame(X_pca, columns=["PC1", "PC2"])
pca_df["cluster"] = y_label  # Attach cluster labels

# Step 4: Check explained variance ratio
print("Explained variance ratio:", pca.explained_variance_ratio_)

# Step 5: Re-run KMeans on PCA-transformed data
kmeans_pca = KMeans(n_clusters=4, random_state=42, n_init="auto")
pca_df["cluster_pca"] = kmeans_pca.fit_predict(X_pca)

# Step 6: Visualize Clusters in PCA space
plt.figure(figsize=(10,6))
sns.scatterplot(x=pca_df["PC1"], y=pca_df["PC2"], hue=pca_df["cluster_pca"], palette="viridis", s=50)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Clusters in PCA-Transformed Space")
plt.legend(title="Cluster (PCA)")
plt.show()
Explained variance ratio: [0.60693438 0.15239682]
[Figure: scatter plot of the K-Means clusters in PCA space (PC1 vs. PC2)]
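Since the notes describe principal components as linear combinations of the original features, a quick numerical check of that claim is sketched below, reusing X_scaled, pca, and X_pca from the cell above.
In [ ]:
# Sketch: PC1 scores are the centered features weighted by the PC1 loadings.
pc1_manual = (X_scaled - pca.mean_) @ pca.components_[0]
print(np.allclose(pc1_manual, X_pca[:, 0]))  # expected: True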
In [45]:
print(X.columns)  # Column order of X going into the loadings below (positions 0-5 only, since the names were dropped earlier; the order matches cluster_df.columns)
Index([0, 1, 2, 3, 4, 5], dtype='int64')
In [59]:
""" 
Understanding PCA Variance Explained
	•	The first principal component (PC1) explains 60.69% of the variance.
	•	The second principal component (PC2) explains 15.23% of the variance.
	•	Together, they capture ~76% of the total variation in the dataset.

 What This Means:
	•	PC1 is likely capturing the dominant socio-economic factors that differentiate census tracts (e.g., income, education levels, and housing status).
	•	PC2 adds additional variation but has a weaker influence compared to PC1.
 Cluster Formation in PCA Space
	•	The clusters are well-separated, which means K-Means was able to group census tracts into meaningful categories.
	•	The yellow cluster (Cluster 3) is distinct from all others, suggesting it represents an outlier group with very different characteristics (e.g., very high/low income, foreign-born population, or housing differences).
	•	The other clusters (purple, blue, green) are spread along PC1, indicating they are mostly structured around the primary factor (likely income/education).
 Possible Socio-Economic Interpretation:
	•	Cluster 0 (Purple): Likely lower-income, lower education, higher poverty.
	•	Cluster 1 (Blue): Middle-income, moderate education levels.
	•	Cluster 2 (Green): Higher-income, higher education levels.
	•	Cluster 3 (Yellow): Outlier group (possibly wealthy urban areas or low-population rural areas).
"""
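If it helps to see how the ~76% figure would grow with more components, the sketch below fits a full PCA on the same standardized data and prints the cumulative explained variance.
In [ ]:
# Sketch: cumulative explained variance for all components of the standardized feature set.
pca_full = PCA().fit(X_scaled)
print(np.cumsum(pca_full.explained_variance_ratio_).round(3))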
In [47]:
# Fit PCA again if not already done
# Note: this fit uses X (the MinMax-scaled features), whereas the PCA above used the
# StandardScaler output, so the loadings below describe this MinMax-scaled fit
pca = PCA(n_components=2)
pca.fit(X)  # X is the MinMax-scaled, numeric feature matrix

# Get the loadings (contributions) of each feature to PC1 and PC2
loadings = pd.DataFrame(pca.components_, columns=X.columns, index=['PC1', 'PC2'])

# Transpose for readability
loadings = loadings.T

# Sort the absolute values of contributions for better visualization
loadings['Abs_PC1'] = loadings['PC1'].abs()
loadings['Abs_PC2'] = loadings['PC2'].abs()

# Sort by importance
pc1_top_features = loadings.sort_values(by='Abs_PC1', ascending=False)[['PC1']]
pc2_top_features = loadings.sort_values(by='Abs_PC2', ascending=False)[['PC2']]

# Print top contributing features
print("\n📌 **Top Variables Contributing to PC1 (Primary Factor)**")
print(pc1_top_features.head(10))

print("\n📌 **Top Variables Contributing to PC2 (Secondary Factor)**")
print(pc2_top_features.head(10))

# 🔥 Heatmap Visualization of Contributions
plt.figure(figsize=(10, 6))
sns.heatmap(loadings[['PC1', 'PC2']], cmap="coolwarm", annot=True, fmt=".2f", linewidths=0.5)
plt.title("Feature Contributions to Principal Components")
plt.xlabel("Principal Components")
plt.ylabel("Original Features")
plt.show()
📌 **Top Variables Contributing to PC1 (Primary Factor)**
        PC1
3  0.478107
1  0.414736
5  0.408578
2  0.398959
0  0.370244
4  0.369085

📌 **Top Variables Contributing to PC2 (Secondary Factor)**
        PC2
0  0.923924
2 -0.250887
3 -0.158334
5 -0.149715
1 -0.135093
4 -0.132991
No description has been provided for this image
In [7]:
''' What This Chart Tells Us
	•	The top table lists the variables that contribute most to PC1 (Primary Factor) and PC2 (Secondary Factor).
	•	The heatmap shows how strongly each original variable correlates with PC1 and PC2 (positive in red, negative in blue).
	•	Higher absolute values in the table/heatmap mean those features are the most influential in defining each principal component.

 Understanding PC1 (Primary Factor)
	•	PC1 has high positive contributions from certain variables (e.g., 0.478107, 0.414736, 0.408578, etc.).
	•	This means these variables account for most of the variance in the dataset.
	•	From the variance ratio (60.7%), PC1 is explaining most of the structure in the dataset.
→ We need to check which variables correspond to these values.

Understanding PC2 (Secondary Factor)
	•	PC2 has one high positive contributor (0.923924) and some negative contributors (-0.250887, -0.158334, etc.).
	•	PC2 is explaining 15.2% of the variance, meaning it’s a less dominant factor than PC1 but still captures important secondary relationships.
	•	Since PC2 has mixed positive and negative values, it likely contrasts two different types of variables.
    
    
    
    High Contributions Mean:
Key Drivers of Socio-Economic Structure: Since PC1 is capturing the most variance, the features with the highest loadings
(income, education, poverty, health insurance coverage) are typically the major drivers of socio-economic conditions in the dataset. These
features help define the broad economic landscape of the population being studied.
Meaningful Variation: When PC1 has high contributions from features like income, education, and insurance coverage, it suggests that these
features are strongly correlated and vary together across the dataset. Essentially, the first principal component might represent a general
“economic development” axis, where higher income, education, and coverage correlate with more affluent, better-educated, and
better-insured areas.
'''
In [2]:
'''Notes for me: Understanding Principal Components:

Principal Components (PCs) are new variables derived from the original data that are linear combinations of the original features. These combinations are constructed in such a way that the first principal component (PC1) explains the most variance in the data, the second principal component (PC2) explains the second most variance, and so on.

Role of PC1 (Primary Component):
	•	Variance Capture: PC1 captures the most significant patterns of variation in the data. The idea is to find a new axis (in the multidimensional space of the original features) along which the data varies the most.
	•	Linear Combination: The principal component is formed as a weighted sum of the original features (variables), where each weight (called a “loading”) indicates the importance of the corresponding feature in defining that component.'''
In [51]:
'''So what are we actually analyzing?

We are trying to understand the underlying structure of the data by seeing which features (income, education, housing, etc.) contribute the most to the principal components (PC1, PC2, etc.).

Biggest drivers of what?
The principal components (PC1, PC2, etc.) represent the most important underlying dimensions in the dataset. So, when we analyze which features dominate PC1 & PC2, we are identifying which types of socioeconomic factors best explain variations across different census tracts in Florida.

Key questions we are answering:
	•	PC1 (Primary Component): What is the main factor that differentiates census tracts?
→ Is it income levels, education attainment, housing ownership, or something else?
	•	PC2 (Secondary Component): What is the second biggest differentiator between census tracts?'''
In [52]:
pc1_top_features = loadings.sort_values(by="Abs_PC1", ascending=False)[["PC1"]].head(10)
pc2_top_features = loadings.sort_values(by="Abs_PC2", ascending=False)[["PC2"]].head(10)
print(pc1_top_features)  # Prints positional indices 0-5 because X lost its column names; see the mapping sketch below the output
print(pc2_top_features)
        PC1
3  0.478107
1  0.414736
5  0.408578
2  0.398959
0  0.370244
4  0.369085
        PC2
0  0.923924
2 -0.250887
3 -0.158334
5 -0.149715
1 -0.135093
4 -0.132991
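The integer row labels above are just positions in cluster_df's column order (the names were lost when X was rebuilt from a NumPy array), so they can be mapped back to feature names as sketched below.
In [ ]:
# Sketch: map the positional indices in the loadings back to the original feature names.
index_to_feature = dict(enumerate(cluster_df.columns))
print(pc1_top_features.rename(index=index_to_feature))
print(pc2_top_features.rename(index=index_to_feature))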
In [ ]: