In [1]:
# Essential import statements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Suppressing scientific notation in array display is sometimes helpful
np.set_printoptions(suppress=True)
In [1]:
#import Census package
from census import Census
import time
In [2]:
from census import Census
import pandas as pd
# Replace with your Census API Key
API_KEY = "KEY"
# Initialize Census API client (ACS 5-Year, 2023)
c = Census(API_KEY, year=2023)
# Define the Census variables to pull (Expanded Data including new requests)
variables = [
"NAME",
# Population & Demographics
"B02001_001E", # Total population
# Income & Poverty
"B19013_001E", # Median household income
"B19301_001E", # Per capita income
"B17021_001E", # Total population for poverty status
"B17021_002E", # Population below poverty level
# Housing
"B25003_001E", # Total housing units
"B25003_002E", # Owner-occupied housing
"B25003_003E", # Renter-occupied housing
# Immigration
"B05002_001E", # Total population for nativity
"B05002_013E", # Foreign-born population
# Education
"B15003_001E", # Total population 25+ for education determination
"B15003_017E", # High school graduate (includes equivalency)
"B15003_018E", # Some college, less than 1 year
"B15003_019E", # Some college, 1+ years, no degree
"B15003_020E", # Associate's degree
"B15003_021E", # Bachelor's degree
"B15003_022E", # Master's degree
"B15003_023E", # Professional school degree
"B15003_024E", # Doctorate degree
"B14001_001E", # Total school enrollment
"B14001_002E", # Enrolled in school (Nursery to 12th grade)
"B14001_008E", # Enrolled in college or graduate school
# Health & Disability
"B18135_001E", # Total population with a disability
"B18135_007E", # People under 18 with a disability
"B27001_001E", # Total population for health insurance
"B27001_005E", # Males 18-24 with health insurance
"B27001_033E", # Females 18-24 with health insurance
]
# Define geography (Florida, FIPS 12)
geo_filter = {"for": "tract:*", "in": "state:12 county:*"}
# Pull data
print("Pulling 2023 ACS Data for Florida...")
data = c.acs5.get(variables, geo_filter)
# Convert to DataFrame
df = pd.DataFrame(data)
# Rename columns for clarity
df.rename(columns={
"NAME": "census_tract",
# Population & Demographics
"B02001_001E": "total_population",
# Income & Poverty
"B19013_001E": "median_household_income",
"B19301_001E": "per_capita_income",
"B17021_001E": "poverty_population",
"B17021_002E": "poverty_below_population",
# Housing
"B25003_001E": "total_housing_units",
"B25003_002E": "owner_occupied_units",
"B25003_003E": "renter_occupied_units",
# Immigration
"B05002_001E": "total_nativity_population",
"B05002_013E": "foreign_born_population",
# Education
"B15003_001E": "total_population_25plus",
"B15003_017E": "high_school_graduate",
"B15003_018E": "some_college_less_than_1_year",
"B15003_019E": "some_college_more_than_1_year",
"B15003_020E": "associates_degree",
"B15003_021E": "bachelors_degree",
"B15003_022E": "masters_degree",
"B15003_023E": "professional_school_degree",
"B15003_024E": "doctorate_degree",
"B14001_001E": "total_school_enrollment",
"B14001_002E": "nursery_to_high_school_enrollment",
"B14001_008E": "college_or_grad_school_enrollment",
# Health & Disability
"B18135_001E": "total_population_with_disability",
"B18135_007E": "disability_under_18",
"B27001_001E": "total_population_with_health_insurance",
"B27001_005E": "males_18_24_with_health_insurance",
"B27001_033E": "females_18_24_with_health_insurance",
}, inplace=True)
# Save to CSV (Renamed Data)
df.to_csv("acs_2023_florida_renamed.csv", index=False)
print("Data saved as acs_2023_florida_renamed.csv")
# Display sample rows
print(df.sample(5))
Pulling 2023 ACS Data for Florida...
KeyboardInterrupt — the statewide request was cancelled manually while the census client was still fetching per-variable metadata (full traceback omitted).
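The statewide pull above was interrupted before it finished. A hedged alternative (not part of the original run) is to request tracts one county at a time with the same c.acs5.get call, pausing briefly between requests; this is only a sketch and reuses the variables list and client defined above.
In [ ]:
# Hypothetical sketch (not from the original notebook): pull tract data county by county.
# First get the list of Florida county FIPS codes, then loop over them.
counties = c.acs5.get(["NAME"], {"for": "county:*", "in": "state:12"})

frames = []
for county in counties:
    geo = {"for": "tract:*", "in": f"state:12 county:{county['county']}"}
    frames.append(pd.DataFrame(c.acs5.get(variables, geo)))
    time.sleep(1)  # short pause between requests

df = pd.concat(frames, ignore_index=True)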
In [33]:
# Load and verify the Census data file that we previously created using the API
florida_df = pd.read_csv('acs_2023_florida_renamed.csv')
#############
# What are some ways to preview this dataset?
# Display basic dataset information
print("Dataset Info:")
florida_df.info()
# Display the first few rows of the dataset
print("\nPreview of the dataset:")
florida_df.head()
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5160 entries, 0 to 5159
Data columns (total 31 columns):
 #   Column                                   Non-Null Count  Dtype
---  ------                                   --------------  -----
 0   census_tract                             5160 non-null   object
 1   total_population                         5160 non-null   float64
 2   median_household_income                  5160 non-null   float64
 3   per_capita_income                        5158 non-null   float64
 4   poverty_population                       5160 non-null   float64
 5   poverty_below_population                 5160 non-null   float64
 6   total_housing_units                      5160 non-null   float64
 7   owner_occupied_units                     5160 non-null   float64
 8   renter_occupied_units                    5160 non-null   float64
 9   total_nativity_population                5160 non-null   float64
 10  foreign_born_population                  5160 non-null   float64
 11  total_population_25plus                  5160 non-null   float64
 12  high_school_graduate                     5160 non-null   float64
 13  some_college_less_than_1_year            5160 non-null   float64
 14  some_college_more_than_1_year            5160 non-null   float64
 15  associates_degree                        5160 non-null   float64
 16  bachelors_degree                         5160 non-null   float64
 17  masters_degree                           5160 non-null   float64
 18  professional_school_degree               5160 non-null   float64
 19  doctorate_degree                         5160 non-null   float64
 20  total_school_enrollment                  5160 non-null   float64
 21  nursery_to_high_school_enrollment        5160 non-null   float64
 22  college_or_grad_school_enrollment        5160 non-null   float64
 23  total_population_with_disability         5160 non-null   float64
 24  disability_under_18                      5160 non-null   float64
 25  total_population_with_health_insurance   5160 non-null   float64
 26  males_18_24_with_health_insurance        5160 non-null   float64
 27  females_18_24_with_health_insurance      5160 non-null   float64
 28  state                                    5160 non-null   int64
 29  county                                   5160 non-null   int64
 30  tract                                    5160 non-null   int64
dtypes: float64(27), int64(3), object(1)
memory usage: 1.2+ MB

Preview of the dataset:
Out[33]:
census_tract | total_population | median_household_income | per_capita_income | poverty_population | poverty_below_population | total_housing_units | owner_occupied_units | renter_occupied_units | total_nativity_population | ... | nursery_to_high_school_enrollment | college_or_grad_school_enrollment | total_population_with_disability | disability_under_18 | total_population_with_health_insurance | males_18_24_with_health_insurance | females_18_24_with_health_insurance | state | county | tract | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Census Tract 2.01; Alachua County; Florida | 5187.0 | 18657.0 | 16690.0 | 5006.0 | 3239.0 | 2319.0 | 153.0 | 2166.0 | 5187.0 | ... | 3987.0 | 3099.0 | 5185.0 | 13.0 | 5185.0 | 0.0 | 0.0 | 12 | 1 | 201 |
1 | Census Tract 2.02; Alachua County; Florida | 5897.0 | 17609.0 | 11493.0 | 4509.0 | 3069.0 | 1897.0 | 247.0 | 1650.0 | 5897.0 | ... | 4878.0 | 4152.0 | 5897.0 | 0.0 | 5897.0 | 0.0 | 0.0 | 12 | 1 | 202 |
2 | Census Tract 3.01; Alachua County; Florida | 3703.0 | 47813.0 | 28654.0 | 3703.0 | 1186.0 | 1855.0 | 427.0 | 1428.0 | 3703.0 | ... | 1429.0 | 778.0 | 3703.0 | 0.0 | 3703.0 | 0.0 | 0.0 | 12 | 1 | 301 |
3 | Census Tract 3.02; Alachua County; Florida | 2500.0 | 39583.0 | 25978.0 | 2500.0 | 394.0 | 1255.0 | 603.0 | 652.0 | 2500.0 | ... | 393.0 | 20.0 | 2500.0 | 0.0 | 2500.0 | 0.0 | 0.0 | 12 | 1 | 302 |
4 | Census Tract 4; Alachua County; Florida | 5736.0 | 51266.0 | 27362.0 | 5736.0 | 1075.0 | 2414.0 | 1297.0 | 1117.0 | 5736.0 | ... | 1637.0 | 336.0 | 5736.0 | 0.0 | 5736.0 | 0.0 | 0.0 | 12 | 1 | 400 |
5 rows × 31 columns
In [34]:
# Display column names
print("Columns in the dataset:")
print(florida_df.columns)
# Show summary statistics for numerical columns
print("\nSummary Statistics:")
florida_df.describe()
Columns in the dataset: Index(['census_tract', 'total_population', 'median_household_income', 'per_capita_income', 'poverty_population', 'poverty_below_population', 'total_housing_units', 'owner_occupied_units', 'renter_occupied_units', 'total_nativity_population', 'foreign_born_population', 'total_population_25plus', 'high_school_graduate', 'some_college_less_than_1_year', 'some_college_more_than_1_year', 'associates_degree', 'bachelors_degree', 'masters_degree', 'professional_school_degree', 'doctorate_degree', 'total_school_enrollment', 'nursery_to_high_school_enrollment', 'college_or_grad_school_enrollment', 'total_population_with_disability', 'disability_under_18', 'total_population_with_health_insurance', 'males_18_24_with_health_insurance', 'females_18_24_with_health_insurance', 'state', 'county', 'tract'], dtype='object') Summary Statistics:
Out[34]:
total_population | median_household_income | per_capita_income | poverty_population | poverty_below_population | total_housing_units | owner_occupied_units | renter_occupied_units | total_nativity_population | foreign_born_population | ... | nursery_to_high_school_enrollment | college_or_grad_school_enrollment | total_population_with_disability | disability_under_18 | total_population_with_health_insurance | males_18_24_with_health_insurance | females_18_24_with_health_insurance | state | county | tract | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 5160.00000 | 5.160000e+03 | 5.158000e+03 | 5160.000000 | 5160.000000 | 5160.000000 | 5160.000000 | 5160.000000 | 5160.00000 | 5160.000000 | ... | 5160.000000 | 5160.000000 | 5160.000000 | 5160.000000 | 5160.000000 | 5160.000000 | 5160.000000 | 5160.0 | 5160.000000 | 5160.000000 |
mean | 4249.78314 | -1.374931e+07 | -1.016864e+07 | 4158.910465 | 524.747674 | 1657.153295 | 1115.660659 | 541.492636 | 4249.78314 | 909.771124 | ... | 933.263566 | 207.125388 | 4184.416279 | 2.440504 | 4184.416279 | 7.444380 | 7.454845 | 12.0 | 71.692442 | 87157.662597 |
std | 2174.55402 | 9.502055e+07 | 8.188418e+07 | 2169.989070 | 436.279490 | 786.672294 | 679.156836 | 464.933551 | 2174.55402 | 915.272246 | ... | 746.118635 | 395.274525 | 2167.695027 | 11.741644 | 2167.695027 | 23.035979 | 22.739017 | 0.0 | 35.938525 | 229403.987266 |
min | 0.00000 | -6.666667e+08 | -6.666667e+08 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 12.0 | 1.000000 | 100.000000 |
25% | 2811.50000 | 5.323475e+04 | 2.767100e+04 | 2730.500000 | 216.000000 | 1136.000000 | 658.000000 | 202.000000 | 2811.50000 | 251.000000 | ... | 452.750000 | 65.000000 | 2747.750000 | 0.000000 | 2747.750000 | 0.000000 | 0.000000 | 12.0 | 39.000000 | 6402.750000 |
50% | 3973.00000 | 6.991300e+04 | 3.642950e+04 | 3885.000000 | 407.000000 | 1561.500000 | 1021.500000 | 413.000000 | 3973.00000 | 571.000000 | ... | 794.500000 | 139.000000 | 3904.000000 | 0.000000 | 3904.000000 | 0.000000 | 0.000000 | 12.0 | 86.000000 | 14405.000000 |
75% | 5377.00000 | 9.137600e+04 | 4.956750e+04 | 5259.000000 | 719.250000 | 2081.250000 | 1463.250000 | 747.000000 | 5377.00000 | 1285.250000 | ... | 1228.000000 | 248.000000 | 5291.000000 | 0.000000 | 5291.000000 | 0.000000 | 0.000000 | 12.0 | 99.000000 | 40202.000000 |
max | 24659.00000 | 2.500010e+05 | 3.007780e+05 | 24659.000000 | 3303.000000 | 7964.000000 | 7238.000000 | 3745.000000 | 24659.00000 | 7585.000000 | ... | 13134.000000 | 13104.000000 | 24659.000000 | 299.000000 | 24659.000000 | 370.000000 | 583.000000 | 12.0 | 133.000000 | 990200.000000 |
8 rows × 30 columns
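The large negative minimums (and means) for median_household_income and per_capita_income come from the Census API jam value -666666666, which marks estimates that could not be computed. A minimal cleaning sketch, not part of the original workflow, that converts these sentinels to missing values before scaling:
In [ ]:
# Sketch (not in the original notebook): treat the ACS jam value -666666666 as missing.
import numpy as np

florida_df = florida_df.replace(-666666666, np.nan)
print(florida_df[["median_household_income", "per_capita_income"]].min())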
In [35]:
# Define a variable we can use to select different input features for the analysis
# I keep a few different sets here using comments to choose one at a time
cluster_df = florida_df[['median_household_income', 'poverty_population',
'foreign_born_population', 'high_school_graduate', 'bachelors_degree',
'total_population_with_health_insurance',]]
#############
# After we run the one above together, you will try your own with four variables.
#############
# Preview our selected data so we can see what will be clustered
display(cluster_df)
median_household_income | poverty_population | foreign_born_population | high_school_graduate | bachelors_degree | total_population_with_health_insurance | |
---|---|---|---|---|---|---|
0 | 18657.0 | 5006.0 | 715.0 | 75.0 | 178.0 | 5185.0 |
1 | 17609.0 | 4509.0 | 226.0 | 95.0 | 78.0 | 5897.0 |
2 | 47813.0 | 3703.0 | 213.0 | 442.0 | 131.0 | 3703.0 |
3 | 39583.0 | 2500.0 | 199.0 | 421.0 | 206.0 | 2500.0 |
4 | 51266.0 | 5736.0 | 263.0 | 848.0 | 170.0 | 5736.0 |
... | ... | ... | ... | ... | ... | ... |
5155 | 45469.0 | 2342.0 | 69.0 | 570.0 | 127.0 | 2434.0 |
5156 | 53220.0 | 3208.0 | 27.0 | 901.0 | 127.0 | 3157.0 |
5157 | 58099.0 | 2671.0 | 27.0 | 682.0 | 98.0 | 2712.0 |
5158 | 56324.0 | 4798.0 | 300.0 | 1790.0 | 353.0 | 4798.0 |
5159 | 48428.0 | 4086.0 | 71.0 | 993.0 | 297.0 | 4104.0 |
5160 rows × 6 columns
In [37]:
# Scale our features data to range between 0 and 1
# To do so we use the MinMaxScaler
# Note that this outputs a numpy array
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Transform the data
X = scaler.fit_transform(cluster_df)
display(pd.DataFrame(X,columns=cluster_df.columns))
median_household_income | poverty_population | foreign_born_population | high_school_graduate | bachelors_degree | total_population_with_health_insurance | |
---|---|---|---|---|---|---|
0 | 0.999653 | 0.203009 | 0.094265 | 0.020309 | 0.078380 | 0.210268 |
1 | 0.999652 | 0.182854 | 0.029796 | 0.025724 | 0.034346 | 0.239142 |
2 | 0.999697 | 0.150168 | 0.028082 | 0.119686 | 0.057684 | 0.150168 |
3 | 0.999684 | 0.101383 | 0.026236 | 0.113999 | 0.090709 | 0.101383 |
4 | 0.999702 | 0.232613 | 0.034674 | 0.229624 | 0.074857 | 0.232613 |
... | ... | ... | ... | ... | ... | ... |
5155 | 0.999693 | 0.094975 | 0.009097 | 0.154346 | 0.055923 | 0.098706 |
5156 | 0.999705 | 0.130094 | 0.003560 | 0.243975 | 0.055923 | 0.128026 |
5157 | 0.999712 | 0.108317 | 0.003560 | 0.184674 | 0.043153 | 0.109980 |
5158 | 0.999710 | 0.194574 | 0.039552 | 0.484701 | 0.155438 | 0.194574 |
5159 | 0.999698 | 0.165700 | 0.009361 | 0.268887 | 0.130779 | 0.166430 |
5160 rows × 6 columns
In [38]:
# This code uses the elbow method to tell us how many clusters we should use in our K-means analysis
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Ensure X is a Pandas DataFrame
if isinstance(X, np.ndarray):
X = pd.DataFrame(X)
# Ensure X contains only numerical data and no missing values
X = X.dropna() # Drop missing values
X = X.select_dtypes(include=[np.number]) # Keep only numeric columns
# Initialize list to store inertia values
inertias = []
# Loop through possible cluster sizes
for i in range(2, 11): # Start at 2 clusters
kmeans_elbow = KMeans(n_clusters=i, n_init="auto", random_state=42)
kmeans_elbow.fit(X)
inertias.append(kmeans_elbow.inertia_)
# Plot the Elbow Method
plt.plot(range(2, 11), inertias, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()
In [39]:
''' 1. Steep Drop Until K=4
• The inertia decreases rapidly from K=2 to K=4, meaning adding more clusters significantly improves the fit (reduces variance within clusters).
• This suggests that 4 clusters capture major structure in the data.
2. Gradual Decrease After K=4
• Beyond K=4, the improvement slows down—indicating diminishing returns.
• This suggests that additional clusters don’t add much new structure but instead refine existing groups.
3. Elbow Point at K=4
• The elbow (where the curve starts flattening) appears around K=4.
• This suggests that 4 clusters might be an optimal choice, balancing simplicity and accuracy.
Since we used several features (income, poverty, immigration, education, and health insurance coverage):
• Each cluster likely represents distinct socio-economic groups in Florida.
• K=4 means we have 4 dominant socio-economic patterns based on the input features.
Possible Cluster Interpretations (Hypothesis)
• Cluster 1: High-income, highly-educated, high homeownership areas.
• Cluster 2: Middle-income, mixed education, moderate homeownership.
• Cluster 3: Lower-income, lower education, high rental population.
• Cluster 4: High foreign-born, lower median income, variable education.'''
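As a hedged cross-check on the elbow reading above (not part of the original notebook), the silhouette score gives a second view of how well separated each candidate number of clusters is:
In [ ]:
# Hedged cross-check: silhouette scores for several candidate values of k.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

for k in range(2, 8):
    labels = KMeans(n_clusters=k, n_init="auto", random_state=42).fit_predict(X)
    print(f"k={k}: silhouette = {silhouette_score(X, labels):.3f}")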
In [40]:
# Conduct the K-means analysis
# First defining a variable to control number of k categories
n_k = 4
from sklearn.cluster import KMeans
kmeans_model = KMeans(n_clusters=n_k, random_state=0, n_init="auto").fit(X)
# Display our cluster center means
display(pd.DataFrame(np.round(kmeans_model.cluster_centers_, decimals=4),columns=cluster_df.columns))
median_household_income | poverty_population | foreign_born_population | high_school_graduate | bachelors_degree | total_population_with_health_insurance | |
---|---|---|---|---|---|---|
0 | 0.9997 | 0.3078 | 0.3113 | 0.3522 | 0.2592 | 0.3081 |
1 | 0.0000 | 0.0059 | 0.0125 | 0.0147 | 0.0060 | 0.0224 |
2 | 0.9997 | 0.1098 | 0.0606 | 0.1172 | 0.0838 | 0.1109 |
3 | 0.9997 | 0.1981 | 0.1294 | 0.2375 | 0.1626 | 0.1985 |
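The centers above are in the 0-1 MinMax-scaled space. A minimal sketch, assuming the scaler fitted earlier is still in scope, to read the same centers back in the original units:
In [ ]:
# Sketch: map the scaled cluster centers back to the original feature units.
centers_original_units = pd.DataFrame(
    scaler.inverse_transform(kmeans_model.cluster_centers_),
    columns=cluster_df.columns,
)
display(np.round(centers_original_units, 1))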
In [ ]:
'''The code below applies PCA to reduce the dimensionality of the dataset and to identify the features that contribute most to its variability. Principal components are linear combinations of the original features that capture the most significant variance in the dataset.
PCA is used to calculate how much variance is explained by each of the principal components.
The first principal component (PC1) explains 60.7% of the variance, while the second principal component (PC2) explains 15.2%.
Together, they capture about 76% of the total variation in the dataset.
Cluster 1 (Purple): Likely represents low-income areas, with lower education and higher poverty levels.
Cluster 2 (Green): Likely represents higher-income areas with higher education levels.
Cluster 3 (Yellow): Potential outliers or special cases, such as urban centers or areas with unique socio-economic traits.
'''
In [41]:
# Predict and visualize the clusters using pairs of features
# First, make the predictions
y_label = kmeans_model.fit_predict(X) # Cluster assignments
# Convert X to DataFrame if it's not already one
if isinstance(X, np.ndarray):
X = pd.DataFrame(X, columns=cluster_df.columns)
# Set feature columns for visualization
column_x = 1 # Choose a valid feature column index
column_y = 2 # Choose another valid feature column index
# Ensure column indices are within bounds
if column_x >= X.shape[1] or column_y >= X.shape[1]:
raise ValueError("Column indices out of bounds. Check your column selection.")
# Scatter plot using clustering results
plt.scatter(X.iloc[:, column_x], X.iloc[:, column_y], c=y_label, cmap="viridis", s=20)
# Label axes correctly
plt.xlabel(X.columns[column_x])
plt.ylabel(X.columns[column_y])
plt.title("Cluster Visualization")
plt.colorbar(label="Cluster")
plt.show()
In [5]:
'''Above: The plot shows how the data points (census tracts) group together in the space of the two selected scaled features. The colors represent the cluster assignments, and the separation indicates that the tracts fall into distinct groups.
Cluster 1 (Purple): Likely represents low-income areas, with lower education and higher poverty levels. Cluster 2 (Green): Likely represents higher-income areas with higher education levels. Cluster 3 (Yellow): Potential outliers or special cases, such as urban centers or areas with unique socio-economic traits.'''
In [42]:
# Convert y_label to a Pandas Series if it's not already
if isinstance(y_label, np.ndarray):
y_label = pd.Series(y_label, index=X.index)
# Then rerun the scatter plot
for i in range(n_k):
plt.scatter(X.loc[y_label == i, column_x], X.loc[y_label == i, column_y], s=2, label='Cluster '+str(i))
plt.xlabel(cluster_df.columns[column_x])
plt.ylabel(cluster_df.columns[column_y])
plt.legend(loc="lower right")
plt.show()
In [43]:
# Visualize the tree (dendrogram) for this data
from sklearn.cluster import AgglomerativeClustering
agglom_model = AgglomerativeClustering(distance_threshold=0,n_clusters=None)
agglom_model = agglom_model.fit(X)
from scipy.cluster.hierarchy import dendrogram
def plot_dendrogram(model, **kwargs):
# Create linkage matrix and then plot the dendrogram
# create the counts of samples under each node
counts = np.zeros(model.children_.shape[0])
n_samples = len(model.labels_)
for i, merge in enumerate(model.children_):
current_count = 0
for child_idx in merge:
if child_idx < n_samples:
current_count += 1 # leaf node
else:
current_count += counts[child_idx - n_samples]
counts[i] = current_count
linkage_matrix = np.column_stack([model.children_, model.distances_, counts]).astype(float)
# Plot the corresponding dendrogram
dendrogram(linkage_matrix, **kwargs)
plt.title("Hierarchical Clustering Dendrogram")
# plot the top three levels of the dendrogram
plot_dendrogram(agglom_model, truncate_mode="level", p=5)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()
In [3]:
'''Above: In essence, the dendrogram shows how data points or objects are hierarchically grouped based on similarity, with each successive level of merging representing a broader grouping of data points. By choosing a particular cut-off on the y-axis, you can determine how many clusters to identify in the data; a sketch of that cut follows below.
This technique is useful for identifying patterns, relationships, and the number of natural clusters present in the data.'''
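A minimal sketch of that cut-off idea; the threshold value below is hypothetical and would be chosen by inspecting the dendrogram's y-axis:
In [ ]:
# Sketch: cut the hierarchical tree at a chosen height to get flat cluster labels.
from sklearn.cluster import AgglomerativeClustering

cut_height = 2.0  # hypothetical threshold taken from the dendrogram's y-axis
agglom_cut = AgglomerativeClustering(distance_threshold=cut_height, n_clusters=None)
flat_labels = agglom_cut.fit_predict(X)
print("Clusters at this cut:", flat_labels.max() + 1)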
In [1]:
'''Below: Principal Component Analysis (PCA) is a statistical technique used for dimensionality reduction while preserving as much
variability (information) as possible in the data. It transforms a large set of possibly correlated variables into a smaller set of
uncorrelated variables known as principal components. These components are linear combinations of the original variables and
capture the most significant variance in the data.
'''
In [44]:
# Import necessary libraries
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
# Step 1: Standardize the data (excluding non-numeric columns)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(cluster_df) # Ensure `cluster_df` contains only numerical features
# Step 2: Apply PCA to reduce dimensions
pca = PCA(n_components=2) # Reduce to 2 principal components
X_pca = pca.fit_transform(X_scaled)
# Step 3: Convert PCA result into a DataFrame
pca_df = pd.DataFrame(X_pca, columns=["PC1", "PC2"])
pca_df["cluster"] = y_label # Attach cluster labels
# Step 4: Check explained variance ratio
print("Explained variance ratio:", pca.explained_variance_ratio_)
# Step 5: Re-run KMeans on PCA-transformed data
kmeans_pca = KMeans(n_clusters=4, random_state=42, n_init="auto")
pca_df["cluster_pca"] = kmeans_pca.fit_predict(X_pca)
# Step 6: Visualize Clusters in PCA space
plt.figure(figsize=(10,6))
sns.scatterplot(x=pca_df["PC1"], y=pca_df["PC2"], hue=pca_df["cluster_pca"], palette="viridis", s=50)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("Clusters in PCA-Transformed Space")
plt.legend(title="Cluster (PCA)")
plt.show()
Explained variance ratio: [0.60693438 0.15239682]
In [45]:
print(X.columns) # This will show the order of original features before PCA
Index([0, 1, 2, 3, 4, 5], dtype='int64')
In [59]:
"""
Understanding PCA Variance Explained
• The first principal component (PC1) explains 60.69% of the variance.
• The second principal component (PC2) explains 15.23% of the variance.
• Together, they capture ~76% of the total variation in the dataset.
What This Means:
• PC1 is likely capturing the dominant socio-economic factors that differentiate census tracts (e.g., income, education levels, and housing status).
• PC2 adds additional variation but has a weaker influence compared to PC1.
Cluster Formation in PCA Space
• The clusters are well-separated, which means K-Means was able to group census tracts into meaningful categories.
• The yellow cluster (Cluster 3) is distinct from all others, suggesting it represents an outlier group with very different characteristics (e.g., very high/low income, foreign-born population, or housing differences).
• The other clusters (purple, blue, green) are spread along PC1, indicating they are mostly structured around the primary factor (likely income/education).
Possible Socio-Economic Interpretation:
• Cluster 0 (Purple): Likely lower-income, lower education, higher poverty.
• Cluster 1 (Blue): Middle-income, moderate education levels.
• Cluster 2 (Green): Higher-income, higher education levels.
• Cluster 3 (Yellow): Outlier group (possibly wealthy urban areas or low-population rural areas).
"""
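One hedged way (not in the original notebook) to see how the PCA-space labels line up with the earlier K-means labels is a simple cross-tabulation of the two label columns created above:
In [ ]:
# Hedged follow-up: compare the original-feature-space labels with the PCA-space labels.
print(pd.crosstab(pca_df["cluster"], pca_df["cluster_pca"],
                  rownames=["k-means on scaled features"],
                  colnames=["k-means on PCA components"]))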
In [47]:
# Fit PCA again if not already done
# Note: X here is the MinMax-scaled feature DataFrame (integer column labels);
# the earlier PCA cell was fit on the StandardScaler output X_scaled
pca = PCA(n_components=2)
pca.fit(X)
# Get the loadings (contributions) of each feature to PC1 and PC2
loadings = pd.DataFrame(pca.components_, columns=X.columns, index=['PC1', 'PC2'])
# Transpose for readability
loadings = loadings.T
# Sort the absolute values of contributions for better visualization
loadings['Abs_PC1'] = loadings['PC1'].abs()
loadings['Abs_PC2'] = loadings['PC2'].abs()
# Sort by importance
pc1_top_features = loadings.sort_values(by='Abs_PC1', ascending=False)[['PC1']]
pc2_top_features = loadings.sort_values(by='Abs_PC2', ascending=False)[['PC2']]
# Print top contributing features
print("\n📌 **Top Variables Contributing to PC1 (Primary Factor)**")
print(pc1_top_features.head(10))
print("\n📌 **Top Variables Contributing to PC2 (Secondary Factor)**")
print(pc2_top_features.head(10))
# 🔥 Heatmap Visualization of Contributions
plt.figure(figsize=(10, 6))
sns.heatmap(loadings[['PC1', 'PC2']], cmap="coolwarm", annot=True, fmt=".2f", linewidths=0.5)
plt.title("Feature Contributions to Principal Components")
plt.xlabel("Principal Components")
plt.ylabel("Original Features")
plt.show()
📌 **Top Variables Contributing to PC1 (Primary Factor)**
        PC1
3  0.478107
1  0.414736
5  0.408578
2  0.398959
0  0.370244
4  0.369085

📌 **Top Variables Contributing to PC2 (Secondary Factor)**
        PC2
0  0.923924
2 -0.250887
3 -0.158334
5 -0.149715
1 -0.135093
4 -0.132991
In [7]:
''' What This Chart Tells Us
• The top table lists the variables that contribute most to PC1 (Primary Factor) and PC2 (Secondary Factor).
• The heatmap shows how strongly each original variable correlates with PC1 and PC2 (positive in red, negative in blue).
• Higher absolute values in the table/heatmap mean those features are the most influential in defining each principal component.
Understanding PC1 (Primary Factor)
• PC1 has high positive contributions from certain variables (e.g., 0.478107, 0.414736, 0.408578, etc.).
• This means these variables account for most of the variance in the dataset.
• From the variance ratio (60.7%), PC1 is explaining most of the structure in the dataset.
→ We need to check which variables correspond to these values.
Understanding PC2 (Secondary Factor)
• PC2 has one high positive contributor (0.923924) and some negative contributors (-0.250887, -0.158334, etc.).
• PC2 is explaining 15.2% of the variance, meaning it’s a less dominant factor than PC1 but still captures important secondary relationships.
• Since PC2 has mixed positive and negative values, it likely contrasts two different types of variables.
High Contributions Mean:
Key Drivers of Socio-Economic Structure: Since PC1 captures the most variance, the features with the highest loadings
(income, education, health coverage) are typically the major drivers of socio-economic conditions in the dataset. These features help
define the broad economic landscape of the population being studied.
Meaningful Variation: When PC1 has high contributions from features like income, education, and health coverage, it suggests that these features
are strongly correlated and vary together across the dataset. Essentially, the first principal component might represent a general
“economic development” axis, where higher income, education, and coverage correlate with more affluent, better-educated areas.
'''
In [2]:
'''Notes for me: Understanding Principal Components:
Principal Components (PCs) are new variables derived from the original data that are linear combinations of the original features. These combinations are constructed in such a way that the first principal component (PC1) explains the most variance in the data, the second principal component (PC2) explains the second most variance, and so on.
Role of PC1 (Primary Component):
• Variance Capture: PC1 captures the most significant patterns of variation in the data. The idea is to find a new axis (in the multidimensional space of the original features) along which the data varies the most.
• Linear Combination: The principal component is formed as a weighted sum of the original features (variables), where each weight (called a “loading”) indicates the importance of the corresponding feature in defining that component.'''
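A minimal sketch of the "weighted sum" idea in these notes; demo_pca is fit fresh here so the check is self-contained, and it assumes X_scaled from the PCA cell is still in scope:
In [ ]:
# Sketch: a PC1 score is the dot product of the centered feature row with the PC1 loadings.
from sklearn.decomposition import PCA
import numpy as np

demo_pca = PCA(n_components=2).fit(X_scaled)
pc1_manual = (X_scaled - demo_pca.mean_) @ demo_pca.components_[0]
print(np.allclose(pc1_manual, demo_pca.transform(X_scaled)[:, 0]))  # expect True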
In [51]:
'''So what are we actually analyzing?
We are trying to understand the underlying structure of the data by seeing which features (income, education, housing, etc.) contribute the most to the principal components (PC1, PC2, etc.).
Biggest drivers of what?
The principal components (PC1, PC2, etc.) represent the most important underlying dimensions in the dataset. So, when we analyze which features dominate PC1 & PC2, we are identifying which types of socioeconomic factors best explain variations across different census tracts in Florida.
Key questions we are answering:
• PC1 (Primary Component): What is the main factor that differentiates census tracts?
→ Is it income levels, education attainment, housing ownership, or something else?
• PC2 (Secondary Component): What is the second biggest differentiator between census tracts?'''
In [52]:
pc1_top_features = loadings.sort_values(by="Abs_PC1", ascending=False)[["PC1"]].head(10)
pc2_top_features = loadings.sort_values(by="Abs_PC2", ascending=False)[["PC2"]].head(10)
print(pc1_top_features)  # Row labels are integers because X was rebuilt without column names; see the sketch below
print(pc2_top_features)
        PC1
3  0.478107
1  0.414736
5  0.408578
2  0.398959
0  0.370244
4  0.369085
        PC2
0  0.923924
2 -0.250887
3 -0.158334
5 -0.149715
1 -0.135093
4 -0.132991
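The row labels print as integers because X was rebuilt as a DataFrame without column names (see the Index([0, 1, 2, 3, 4, 5], dtype='int64') output above). A minimal sketch to attach the descriptive names, assuming the fitted PCA used the same six features in the same order as cluster_df:
In [ ]:
# Sketch: rebuild the loadings table with the original feature names.
named_loadings = pd.DataFrame(
    pca.components_,
    columns=cluster_df.columns,  # assumed to match the order of features in X
    index=["PC1", "PC2"],
).T
print(named_loadings.reindex(named_loadings["PC1"].abs()
                             .sort_values(ascending=False).index))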