Skip to content

Commit 3bf02ff

Browse files
authored
Merge pull request #217 from SANDAG/add-estimates-of-self-employed-individuals
[PULL REQUEST] add in self employment to employment module
2 parents 76cc439 + c39a086 commit 3bf02ff

12 files changed

Lines changed: 611 additions & 299 deletions

python/employment.py

Lines changed: 151 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,15 @@
1212

1313

1414
def run_employment(year: int, debug: bool):
15-
"""Control function to create jobs data by naics_code (NAICS) at the MGRA level.
15+
"""Control function to create jobs data by industry_code at the MGRA level.
1616
1717
Get the LEHD LODES data, aggregate to the MGRA level using the block to MGRA
1818
crosswalk, then apply control totals from QCEW using integerization.
1919
2020
Functionality is split apart for code encapsulation (function inputs not included):
2121
_get_jobs_inputs - Get all input data related to jobs, including LODES data,
2222
block to MGRA crosswalk, and control totals from QCEW. Then process the
23-
LODES data to the MGRA level by naics_code.
23+
LODES data to the MGRA level by industry_code.
2424
_validate_jobs_inputs - Validate the input tables from the above function
2525
_create_jobs_output - Apply control totals to employment data using
2626
utils.integerize_1d() and create output table
@@ -34,14 +34,14 @@ def run_employment(year: int, debug: bool):
3434
jobs_inputs = _get_jobs_inputs(year)
3535
_validate_jobs_inputs(jobs_inputs)
3636

37-
jobs_outputs = _create_jobs_output(jobs_inputs)
37+
jobs_outputs = _create_jobs_output(jobs_inputs, year)
3838
_validate_jobs_outputs(jobs_outputs)
3939

4040
_insert_jobs(jobs_inputs, jobs_outputs, debug)
4141

4242

4343
def _get_lodes_data(year: int) -> pd.DataFrame:
44-
"""Retrieve LEHD LODES data for a specified year and split naics_code 72 into
44+
"""Retrieve LEHD LODES data for a specified year and split industry_code 72 into
4545
721 and 722 using split percentages.
4646
4747
Args:
@@ -68,23 +68,23 @@ def _get_lodes_data(year: int) -> pd.DataFrame:
6868
params={"year": year},
6969
)
7070

71-
# Split naics_code 72 and combine with other industries
72-
lodes_72_split = lodes_data.loc[lambda df: df["naics_code"] == "72"].merge(
71+
# Split industry_code 72 and combine with other industries
72+
lodes_72_split = lodes_data.loc[lambda df: df["industry_code"] == "72"].merge(
7373
split_naics_72, on="block", how="left"
7474
)
7575

7676
combined_data = pd.concat(
7777
[
78-
lodes_data.loc[lambda df: df["naics_code"] != "72"],
78+
lodes_data.loc[lambda df: df["industry_code"] != "72"],
7979
lodes_72_split.assign(
80-
naics_code="721", jobs=lambda df: df["jobs"] * df["pct_721"]
80+
industry_code="721", jobs=lambda df: df["jobs"] * df["pct_721"]
8181
),
8282
lodes_72_split.assign(
83-
naics_code="722", jobs=lambda df: df["jobs"] * df["pct_722"]
83+
industry_code="722", jobs=lambda df: df["jobs"] * df["pct_722"]
8484
),
8585
],
8686
ignore_index=True,
87-
)[["year", "block", "naics_code", "jobs"]]
87+
)[["year", "block", "industry_code", "jobs"]]
8888

8989
return combined_data
9090

@@ -100,13 +100,13 @@ def _aggregate_lodes_to_mgra(
100100
MGRAs.
101101
102102
Args:
103-
combined_data: LODES data with columns: year, block, naics_code, jobs
103+
combined_data: LODES data with columns: year, block, industry_code, jobs
104104
xref: Crosswalk with columns: block, mgra, pct_edd, pct_area, edd_flag
105105
year: The year for which to aggregate data
106106
107107
Returns:
108108
Aggregated data at MGRA level with columns: run_id, year, mgra,
109-
naics_code, value
109+
industry_code, value
110110
"""
111111
# Get MGRA data from SQL
112112
with utils.ESTIMATES_ENGINE.connect() as con:
@@ -124,28 +124,83 @@ def _aggregate_lodes_to_mgra(
124124
)
125125

126126
# Get unique industry codes and cross join with MGRA data
127-
unique_industries = combined_data["naics_code"].unique()
127+
unique_industries = combined_data["industry_code"].unique()
128128
jobs = (
129-
mgra_data.merge(pd.DataFrame({"naics_code": unique_industries}), how="cross")
129+
mgra_data.merge(pd.DataFrame({"industry_code": unique_industries}), how="cross")
130130
.assign(year=year)
131131
.merge(
132132
combined_data.merge(xref, on="block", how="inner")
133133
.assign(
134134
value=lambda df: df["jobs"]
135135
* np.where(df["edd_flag"] == 1, df["pct_edd"], df["pct_area"])
136136
)
137-
.groupby(["year", "mgra", "naics_code"], as_index=False)["value"]
137+
.groupby(["year", "mgra", "industry_code"], as_index=False)["value"]
138138
.sum(),
139-
on=["year", "mgra", "naics_code"],
139+
on=["year", "mgra", "industry_code"],
140140
how="left",
141141
)
142142
.fillna({"value": 0})
143-
.assign(run_id=utils.RUN_ID)[["run_id", "year", "mgra", "naics_code", "value"]]
143+
.assign(run_id=utils.RUN_ID)[
144+
["run_id", "year", "mgra", "industry_code", "value"]
145+
]
144146
)
145147

146148
return jobs
147149

148150

151+
def _distribute_self_emp_to_mgra(
152+
bg_data: pd.DataFrame, xref: pd.DataFrame
153+
) -> pd.DataFrame:
154+
"""Distribute self-employed block group data to MGRA level using allocation
155+
percentages.
156+
157+
Args:
158+
bg_data: DataFrame with block group self-employed data. Must include columns:
159+
year, blockgroup, industry_code, value
160+
xref: Crosswalk DataFrame with columns: blockgroup, mgra, flag, pct_18_64,
161+
pct_pop, pct_split
162+
163+
Returns:
164+
Self-Emp Data at MGRA level with columns: run_id, year, mgra, industry_code, value
165+
"""
166+
# Merge block group data to MGRA crosswalk
167+
merged = bg_data.merge(xref, on="blockgroup", how="inner")
168+
169+
# Calculate weighted value based on flag
170+
merged = merged.assign(
171+
weighted_value=np.select(
172+
[
173+
merged["flag"] == "pct_18_64",
174+
merged["flag"] == "pct_pop",
175+
merged["flag"] == "pct_split",
176+
],
177+
[
178+
merged["value"] * merged["pct_18_64"],
179+
merged["value"] * merged["pct_pop"],
180+
merged["value"] * merged["pct_split"],
181+
],
182+
default=np.nan,
183+
)
184+
)
185+
186+
if merged["weighted_value"].isna().any():
187+
raise ValueError(
188+
"Unexpected allocation flag found; expected one of {'pct_18_64', 'pct_pop', 'pct_split'}"
189+
)
190+
191+
# Group by year, mgra, industry_code and sum, then assign run_id and reorder columns
192+
self_emp = (
193+
merged.groupby(["year", "mgra", "industry_code"], as_index=False)[
194+
"weighted_value"
195+
]
196+
.sum()
197+
.rename(columns={"weighted_value": "value"})
198+
.assign(run_id=utils.RUN_ID)
199+
)[["run_id", "year", "mgra", "industry_code", "value"]]
200+
201+
return self_emp
202+
203+
149204
def _get_jobs_inputs(year: int) -> dict[str, pd.DataFrame]:
150205
"""Get input data related to jobs for a specified year.
151206
@@ -160,8 +215,8 @@ def _get_jobs_inputs(year: int) -> dict[str, pd.DataFrame]:
160215
jobs_inputs["lodes_data"] = _get_lodes_data(year)
161216

162217
with utils.GIS_ENGINE.connect() as con:
163-
# get crosswalk from Census blocks to MGRAs
164-
with open(utils.SQL_FOLDER / "employment/edd_land_use_split.sql") as file:
218+
# Get crosswalk from Census blocks to MGRAs
219+
with open(utils.SQL_FOLDER / "employment/xref_block_to_mgra.sql") as file:
165220
jobs_inputs["xref_block_to_mgra"] = utils.read_sql_query_fallback(
166221
sql=sql.text(file.read()),
167222
con=con,
@@ -172,20 +227,53 @@ def _get_jobs_inputs(year: int) -> dict[str, pd.DataFrame]:
172227
)
173228

174229
with utils.ESTIMATES_ENGINE.connect() as con:
175-
# get regional employment control totals from QCEW
176-
with open(utils.SQL_FOLDER / "employment/QCEW_control.sql") as file:
230+
# Get regional employment control totals from QCEW
231+
with open(utils.SQL_FOLDER / "employment/get_region_qcew.sql") as file:
177232
jobs_inputs["control_totals"] = utils.read_sql_query_fallback(
178233
sql=sql.text(file.read()),
179234
con=con,
180235
params={
181236
"year": year,
182237
},
183238
)
184-
jobs_inputs["control_totals"]["run_id"] = utils.RUN_ID
185239

186-
jobs_inputs["lehd_jobs"] = _aggregate_lodes_to_mgra(
187-
jobs_inputs["lodes_data"], jobs_inputs["xref_block_to_mgra"], year
188-
)
240+
# Get self-employed totals and append to control_totals
241+
with open(utils.SQL_FOLDER / "employment/get_region_self_emp.sql") as file:
242+
self_emp_control = utils.read_sql_query_fallback(
243+
sql=sql.text(file.read()),
244+
con=con,
245+
params={
246+
"year": year,
247+
},
248+
)
249+
250+
jobs_inputs["control_totals"] = pd.concat(
251+
[jobs_inputs["control_totals"], self_emp_control],
252+
ignore_index=True,
253+
)
254+
255+
jobs_inputs["control_totals"]["run_id"] = utils.RUN_ID
256+
257+
# Get self-employed block group data
258+
with open(utils.SQL_FOLDER / "employment/get_B24080.sql") as file:
259+
jobs_inputs["B24080"] = utils.read_sql_query_fallback(
260+
sql=sql.text(file.read()),
261+
con=con,
262+
params={
263+
"year": year,
264+
},
265+
)
266+
267+
# Get block group to MGRA crosswalk
268+
with open(utils.SQL_FOLDER / "employment/xref_bg_to_mgra.sql") as file:
269+
jobs_inputs["xref_bg_to_mgra"] = pd.read_sql_query(
270+
sql=sql.text(file.read()),
271+
con=con,
272+
params={
273+
"run_id": utils.RUN_ID,
274+
"year": year,
275+
},
276+
)
189277

190278
return jobs_inputs
191279

@@ -200,32 +288,39 @@ def _validate_jobs_inputs(jobs_inputs: dict[str, pd.DataFrame]) -> None:
200288
negative={},
201289
null={},
202290
)
291+
# Self-employed data only includes block groups with self-employed individuals, therefore
292+
# no row count validation performed
293+
tests.validate_data(
294+
"Self-employed block group data",
295+
jobs_inputs["B24080"],
296+
negative={},
297+
null={},
298+
)
203299
# No row count validation performed as xref is many-to-many
204-
# check
205300
tests.validate_data(
206-
"xref",
301+
"xref_block_to_mgra",
207302
jobs_inputs["xref_block_to_mgra"],
208303
negative={},
209304
null={},
210305
)
306+
# No row count validation performed as xref is many-to-many
211307
tests.validate_data(
212-
"QCEW control totals",
213-
jobs_inputs["control_totals"],
214-
row_count={"key_columns": {"naics_code"}},
308+
"xref_bg_to_mgra",
309+
jobs_inputs["xref_bg_to_mgra"],
215310
negative={},
216311
null={},
217312
)
218313
tests.validate_data(
219-
"LEHD jobs at MGRA level",
220-
jobs_inputs["lehd_jobs"],
221-
row_count={"key_columns": {"mgra", "naics_code"}},
314+
"QCEW control totals",
315+
jobs_inputs["control_totals"],
316+
row_count={"key_columns": {"industry_code"}},
222317
negative={},
223318
null={},
224319
)
225320

226321

227322
def _create_jobs_output(
228-
jobs_inputs: dict[str, pd.DataFrame],
323+
jobs_inputs: dict[str, pd.DataFrame], year: int
229324
) -> dict[str, pd.DataFrame]:
230325
"""Apply control totals to employment data using utils.integerize_1d().
231326
@@ -235,22 +330,35 @@ def _create_jobs_output(
235330
Returns:
236331
Controlled employment data.
237332
"""
238-
# Sort the input data and get unique naics codes
239-
sorted_jobs = jobs_inputs["lehd_jobs"].sort_values(by=["mgra", "naics_code"])
240-
naics_codes = sorted_jobs["naics_code"].unique()
333+
# Create MGRA level jobs data by combining LODES and self-employment data
334+
mgra_jobs = pd.concat(
335+
[
336+
# Aggregate LODES jobs to MGRA level
337+
_aggregate_lodes_to_mgra(
338+
jobs_inputs["lodes_data"], jobs_inputs["xref_block_to_mgra"], year
339+
),
340+
# Distribute self-employment data to MGRA level
341+
_distribute_self_emp_to_mgra(
342+
jobs_inputs["B24080"], jobs_inputs["xref_bg_to_mgra"]
343+
),
344+
],
345+
ignore_index=True,
346+
).sort_values(by=["mgra", "industry_code"])
241347

242348
# Create list to store controlled values for each industry
243349
results = []
244350

245-
# Apply integerize_1d to each naics code
246-
for naics_code in naics_codes:
247-
# Filter for this naics code
248-
naics_mask = sorted_jobs.query("naics_code == @naics_code")
351+
# Apply integerize_1d to each industry_code
352+
for industry_code in mgra_jobs["industry_code"].unique():
353+
# Filter for this industry_code
354+
naics_mask = mgra_jobs.loc[mgra_jobs["industry_code"] == industry_code]
249355

250356
# Get control value and apply integerize_1d
251357
control_value = (
252358
jobs_inputs["control_totals"]
253-
.query("naics_code == @naics_code")["value"]
359+
.loc[
360+
jobs_inputs["control_totals"]["industry_code"] == industry_code, "value"
361+
]
254362
.iloc[0]
255363
)
256364

@@ -273,7 +381,7 @@ def _validate_jobs_outputs(jobs_outputs: dict[str, pd.DataFrame]) -> None:
273381
tests.validate_data(
274382
"Controlled jobs data",
275383
jobs_outputs["results"],
276-
row_count={"key_columns": {"mgra", "naics_code"}},
384+
row_count={"key_columns": {"mgra", "industry_code"}},
277385
negative={},
278386
null={},
279387
)

python/tests.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,12 @@
3434
"2010_tract": 627,
3535
"2020_tract": 736,
3636
"city": 19,
37-
"naics_code": 21,
37+
# The industry_code is a variable to group employment data into. Almost all
38+
# codes are 2-digit naics codes. The 2-digit naics code 72 was split into 721
39+
# and 722, which are 3-digit naics codes. Self-employment data does not natively have a
40+
# naics code, so it is assigned to 'SE' as the naics codes are being
41+
# treated as strings.
42+
"industry_code": 22,
3843
},
3944
"mgra15": {
4045
"mgra": 24321,

python/utils.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -690,11 +690,11 @@ def read_sql_query_fallback(max_lookback: int = 1, **kwargs: dict) -> pd.DataFra
690690
returns a message indicating that the year does not exist in the dataset being
691691
accessed, it will automatically decrement the year by one and re-run the query.
692692
This process will continue for up to max_lookback years back, or 1 year if not
693-
specified. Note this function is specific to querying ACS 5-Year Tables, LEHD LODES,
693+
specified. Note this function is specific to querying ACS Tables, LEHD LODES,
694694
EDD point-level, and QCEW data. This function requires the SQL query file to return
695-
a DataFrame with a single column called 'msg' with one of 5 text strings:
696-
'ACS 5-Year Table does not exist', 'PUMS 5-Year does not exist',
697-
'LODES data does not exist', 'EDD point-level data does not exist', or
695+
a DataFrame with a single column called 'msg' with one of 6 text strings: 'ACS
696+
5-Year Table does not exist', 'ACS 1-Year Table does not exist', 'PUMS 5-Year does
697+
not exist', 'LODES data does not exist', 'EDD point-level data does not exist', or
698698
'QCEW data does not exist' when no data is found for the specified year.
699699
700700
Args:
@@ -719,6 +719,7 @@ def read_sql_query_fallback(max_lookback: int = 1, **kwargs: dict) -> pd.DataFra
719719
# Messages that trigger year lookback
720720
lookback_messages = [
721721
"ACS 5-Year Table does not exist",
722+
"ACS 1-Year Table does not exist",
722723
"PUMS 5-Year does not exist",
723724
"LODES data does not exist",
724725
"EDD point-level data does not exist",

0 commit comments

Comments
 (0)