JGCJ2CQPYNRNLDQGVRIDIB5EGKW2O7PCTAIHKZTTJQXE2VTY27QQC
NMT3JJPPFHOOL37U6JJIUSBCL5C6BRLLH6CDO3TS5YKRDJPK6NFAC
2AE4VE7UAQOEXBTOWOSXSEC2Q5PII4PNZH6MZFCLGNY3LGJOGKIAC
7ZN6HHL2PQTLSZCD3AHXWAUE5ZGNY4S5PFR2XCDYR35GFGASPDLQC
QU7MDQED456DFUKHLHCS2SS5QSLDYFU233YNQ27KY2TOVDMTSUKAC
QDEBWR2RIYZNPXOVOSDXPXTXOZ3NVJHRAUDHPCRZXE5DFT2VRCMQC
FCHAPZLDDTY46FYACRWZOPVXYLCSUBPZVNYYGRIUO3IFC6BM2TZAC
4YKXEBAVKKIWUC6EIG4MGSTZOHD7XJSO5O4YNU3OK5SPKMIWX7YQC
# Simple function that says whether or not to use a category
def should_keep(category):
    """Return True when `category` contains none of the excluded phrases.

    The check is a plain substring test (`phrase in category`), so an
    excluded phrase anywhere inside the heading rejects it.

    Args:
        category: Column heading, e.g. 'C11_1_149_R1_74'.

    Returns:
        False if any excluded phrase occurs in `category`, otherwise True.
    """
    # If any of these phrases are in the category, it's not allowed!
    bad_words = (
        "Neg_Ni_inc",  # Negative/nil income
        "more",  # Brackets over a certain threshold (eg $4000more)
        "or more",
        "NS",  # Not stated
        "Notstated",
        "Tot",  # Total
        "PI_S",
    )
    # Keep the category only if not a single bad word shows up in it
    return not any(bad_word in category for bad_word in bad_words)
# Take a valid category and return the year, mean income & rent
def split_category(category):
    """Split a heading such as 'C11_1_149_R1_74' into year and bracket means.

    The five underscore-separated parts are read as: year ('C' followed by
    two digits), minimum income, maximum income, minimum rent (carrying an
    'R' prefix) and maximum rent.

    Args:
        category: Heading that `should_keep` accepts.

    Returns:
        Tuple `(year, mean_income, mean_rent)`. Each mean is the midpoint of
        its bracket computed as `(low + high + 1) // 2`, i.e. halves round up.

    Raises:
        AssertionError: If `should_keep` rejects the category or the heading
            does not have the expected shape. Raised explicitly (rather than
            through `assert`) so the checks still run under `python -O`.
        ValueError: If a numeric part cannot be converted by `int`.
    """
    # Example category: 'C11_1_149_R1_74'
    parts = category.split("_")
    # Make sure it's a valid category (checked once, result reused)
    if not should_keep(category):
        print(f"Invalid category: {category}")
        raise AssertionError(f"Invalid category: {category}")
    # All valid categories have 5 parts
    if len(parts) != 5:
        raise AssertionError(
            f"Expected 5 parts but found {len(parts)} in category: {category}"
        )
    # Split up the parts
    year, min_income, max_income, min_rent, max_rent = parts
    min_income = int(min_income)
    max_income = int(max_income)
    max_rent = int(max_rent)
    # Year starts with 'C' followed by 2 digits
    if not (len(year) == 3 and year[0] == "C" and year[1:].isdigit()):
        raise AssertionError(f"Unexpected year {year!r} in category: {category}")
    # `min_rent` is special because it has the 'R' prefix
    # (`startswith` also copes with an empty part instead of an IndexError)
    if not min_rent.startswith("R"):
        raise AssertionError(
            f"Rent bracket {min_rent!r} lacks the 'R' prefix in category: {category}"
        )
    # Then we can just convert it into an int like the others
    min_rent = int(min_rent[1:])
    mean_income = (min_income + max_income + 1) // 2
    mean_rent = (min_rent + max_rent + 1) // 2
    return (year, mean_income, mean_rent)
# Load every CSV in `files` and separate its heading row from its data rows
for filename in files:
    # Read the lines inside a `with` block so the file handle is closed as
    # soon as we have them, instead of being left for the garbage collector
    with open(filename, "r") as handle:
        file = handle.readlines()
    # Split each row on the comma
    file = [[col.strip() for col in row.split(",")] for row in file]
    # First line in file (file[0]) is headings
    first_line = file[0]
    # Don't need the first column, as it says 'STE_CODE_2021'
    # Example value for `categories_for_file`:
    # ['C11_Neg_Ni_inc_R1_74', 'C11_Neg_Ni_inc_R75_99', 'C11_Neg_Ni_inc_R100_149', etc]
    categories_for_file = first_line[1:]
    # When we need to use categories, use `categories_for_file`
    # The `file` variable should be JUST data, so remove the first line
    # Example value:
    # [
    #     ['1', '400', '505', etc],
    #     ['2', '241', '297', etc],
    #     etc
    # ]
    file = file[1:]
def median_incomes(self):
# NOTE(review): indentation has been lost in this part of the file and the
# body may mix lines from more than one revision - confirm the intended
# nesting before relying on it.
# Walks every column of `self.categories`, totals that column of `self.data`
# and splits the heading into a year prefix and the remaining category name.
# NOTE(review): `incomes` is created here but nothing visible fills it in or
# returns it.
incomes = {}
for column in range(len(self.categories)):
category = self.categories[column]
# Get a list of all the areas (states, could later be LGAs)
# Should end up looking like:
# ['1', '2', '3', etc]
areas = []
# NOTE(review): `file` is not assigned anywhere in this method; presumably it
# is the table of data rows built while reading the CSV - verify
for row in range(len(file)):
# Area code is the first column of each row
# Example value: '1'
first_cell = file[row][0]
# Add that to list of all areas
areas.append(first_cell)
# Add up this column's cell from every row of `self.data`
total_income = 0
for row in range(len(self.data)):
total_income += int(self.data[row][column])
# Example value: 'C11_Neg_Ni_inc_R1_74'
# Split the category up into parts (separated by underscores)
category = category.split("_")
# Example value for category: ['C11', 'Neg', 'Ni', etc]
# Year is always prefixed, so get everything before first underscore
# Example value: 'C11' (first item in category)
year = category[0]
# Category is everything after the first underscore
# Example value: 'Neg_Ni_inc_R1_74'
category = "_".join(category[1:])
def median_rents(self):
# NOTE(review): indentation has been lost in this part of the file and the
# body may mix lines from more than one revision - confirm the intended
# nesting before relying on it.
# Walks every column of `self.categories`, totals that column of `self.data`
# and finally returns `rents`.
# NOTE(review): nothing visible adds entries to `rents`, so as written the
# dict that is returned is still empty.
rents = {}
for column in range(len(self.categories)):
category = self.categories[column]
# Add that category to the list of categories (without year)
# Example value: 'Neg_Ni_inc_R1_74' (no year included!!)
# TODO: assert all categories are the same across years
# NOTE(review): `categories` is not assigned in this method - confirm where
# it is meant to come from
categories.append(category)
# Add up this column's cell from every row of `self.data`
total_rent = 0
for row in range(len(self.data)):
total_rent += int(self.data[row][column])
# Now add every row in this column
# NOTE(review): `file` is not assigned in this method either - verify
for row in range(len(file)):
# years[year] is a dict mapping states to rows
# Every key is essentially one row of data in the csv
# It will end up looking something like:
"""
{
'1': [1, 2, 3, etc],
'2': [3, 1, 2, etc],
etc
}
"""
# Example value: first row (row = 0), area = '1'
# NOTE(review): `areas` is not assigned in this method (a list of that name
# is built inside `median_incomes`) - verify
area = areas[row]
return rents
# Simple function that says whether or not to use a category
def should_keep(category):
    """Return True when `category` contains none of the excluded phrases.

    The check is a plain substring test (`phrase in category`), so an
    excluded phrase anywhere inside the heading rejects it.

    Args:
        category: Column heading, e.g. '1_149_R1_74'.

    Returns:
        False if any excluded phrase occurs in `category`, otherwise True.
    """
    # If any of these phrases are in the category, it's not allowed!
    bad_words = (
        "Neg_Ni_inc",
        "4000more",
        "R650more",
        "R_NS",
        "R_Tot",
        "PI_S",
        "or more",
        "Notstated",
    )
    # Keep the category only if not a single bad word shows up in it
    return not any(bad_word in category for bad_word in bad_words)
# Take a valid category and return the mean income & rent
def split_category(category):
    """Split a heading such as '1_149_R1_74' into mean income and mean rent.

    The four underscore-separated parts are read as: minimum income, maximum
    income, minimum rent (carrying an 'R' prefix) and maximum rent, following
    the example heading above.

    Args:
        category: Year-less heading that `should_keep` accepts.

    Returns:
        Tuple `(mean_income, mean_rent)`; each mean is the bracket midpoint
        passed through the built-in `round`.

    Raises:
        AssertionError: If `should_keep` rejects the category or the heading
            does not have the expected shape. Raised explicitly (rather than
            through `assert`) so the checks still run under `python -O`.
        ValueError: If a numeric part cannot be converted by `int`.
    """
    # Example category: '1_149_R1_74'
    parts = category.split("_")
    # Make sure it's a valid category (checked once, result reused)
    if not should_keep(category):
        print(category)
        raise AssertionError(f"Invalid category: {category}")
    # All valid categories have 4 parts
    if len(parts) != 4:
        raise AssertionError(
            f"Expected 4 parts but found {len(parts)} in category: {category}"
        )
    # Pull the four bracket bounds out of the heading; without this step the
    # names used in the means below would never be defined
    min_income, max_income, min_rent, max_rent = parts
    min_income = int(min_income)
    max_income = int(max_income)
    max_rent = int(max_rent)
    # `min_rent` is special because it has the 'R' prefix
    if not min_rent.startswith("R"):
        raise AssertionError(
            f"Rent bracket {min_rent!r} lacks the 'R' prefix in category: {category}"
        )
    min_rent = int(min_rent[1:])
    mean_income = round((min_income + max_income) / 2)
    mean_rent = round((min_rent + max_rent) / 2)
    return (mean_income, mean_rent)
# Second step: split 2-variable categories into table
def reshape_into_table(categories, years):
# NOTE(review): indentation has been lost in this part of the file, and the
# visible body stops right after the `split_category` call - the lines that
# use `income`, `rent` and `last_income` are not part of this span. Confirm
# the intended nesting and the missing tail before relying on it.
# For every year and every area in `years`, starts a fresh `new_data` list
# and walks the categories, splitting each kept one into (income, rent).
# `years` is indexed as years[year][area]; that entry must hold exactly one
# item per entry of `categories` (see the length assert below).
# Example value: 'C11'
for year in years:
# Example value: '1'
for area in years[year]:
# TODO: categories may change between iterations
# This should look like:
"""
[
# This is income of $1-$149
[
# This is rent of $1-$74
1535,
# This is rent of $75-$99
1580,
etc
],
# This is income of $150-$299
[
# This is rent of $1-$74
4591,
# This is rent of $75-$99
2188,
etc
],
etc
]
"""
new_data = []
# The last mean income we saw
last_income = None
# Make sure there are as many categories as data
assert len(years[year][area]) == len(categories)
# Go through each 2-variable category and data pair
for column in range(len(categories)):
# Get the relevant category
category = categories[column]
# Check if the category is part of our dataset
if should_keep(category) is True:
# Unpacks two values, so this relies on the `split_category` variant that
# returns (mean_income, mean_rent) without a year
income, rent = split_category(category)
class Census2011_2016_2021(CensusDataset):
# NOTE(review): indentation has been lost in this part of the file. Lines
# below that use `years`, `new_data` and `categories` share their names with
# `reshape_into_table` and may belong to a different function or revision -
# confirm the intended structure before editing.
def __init__(self, files):
# One table per census; they start as None and are replaced by one empty
# list per row of `file` when `file_index == 0`
census2011 = None
census2016 = None
census2021 = None
# Append our rent information to the most recent bracket
# The most recent bracket will always be the one we want
# NOTE(review): `years`, `year`, `area`, `column` and `new_data` are not
# assigned earlier in this method
current_cell = years[year][area][column]
current_cell = int(current_cell)
new_data[-1].append(current_cell)
# First-time setup requires opening the first file
# NOTE(review): `file_index` and `file` are not assigned in the visible lines;
# presumably they come from a loop over `files` - verify
if file_index == 0:
census2011 = [[] for row in range(len(file))]
census2016 = [[] for row in range(len(file))]
census2021 = [[] for row in range(len(file))]
# Update this from old->new data
years[year][area] = new_data
# Headings are the first row
headings = file[0]
# Make sure to skip the first column (STE_CODE_2021)
for column in range(1, len(file[0])):
if should_keep(headings[column]):
# Unpacks three values, so this relies on the `split_category` variant that
# also returns the year
(year, mean_income, mean_rent) = split_category(headings[column])
# Collect each distinct mean income and mean rent, in first-seen order
incomes = []
rents = []
for category in categories:
if should_keep(category):
# Unpacks two values here, unlike the three-value unpacking above
# NOTE(review): the two call sites cannot both match one `split_category`
(mean_income, mean_rent) = split_category(category)
if mean_income not in incomes:
incomes.append(mean_income)
if mean_rent not in rents:
rents.append(mean_rent)
# Add each cell to its corresponding row
for row in range(len(file)):
# Row 0 gets the (mean_income, mean_rent) pair as its cell; every other row
# takes the value straight from `file`
# NOTE(review): `cell` is assigned but not used in the visible lines
if row == 0:
cell = (mean_income, mean_rent)
else:
cell = file[row][column]
# (categories, years) = merge_files([TIME_SERIES_A])
# (categories, years) = reshape_into_table(categories, years)
# print(categories)
# Wrap each per-census table in its dataset class and keep it on the instance
self.census2011 = Census2011(census2011)
self.census2016 = Census2016(census2016)
self.census2021 = Census2021(census2021)
# Show the median incomes and then the median rents for each census in turn,
# building a fresh dataset object for every call
for census in (Census1996, Census2001):
    print(census().median_incomes())
    print(census().median_rents())
# print(Census1996().median_incomes())
# print(Census1996().median_rents())
# print(Census2001().median_incomes())
# print(Census2001().median_rents())