Skip to content
This repository was archived by the owner on Jun 30, 2023. It is now read-only.
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
d1b46a1
quite stuck, df.to_sql isn't working
SashaWeinstein Oct 13, 2022
fe46188
geocoding during build seems to work. Next implement similar process …
SashaWeinstein Oct 14, 2022
70d1e92
HNY_geo populates, but HNY_devdb does not. I suspect some sort of typ…
SashaWeinstein Oct 17, 2022
e5f07f8
format with black
SashaWeinstein Oct 17, 2022
e0bdd28
create HNY_devdb with inner join
SashaWeinstein Oct 18, 2022
3631916
clean print statements and format
SashaWeinstein Oct 18, 2022
9cb2a58
install minio
SashaWeinstein Oct 21, 2022
a568df7
this docker compose doesn't work yet but it's a good start
SashaWeinstein Oct 21, 2022
c7a4690
fix minio installation and add poetry to path
td928 Nov 9, 2022
229fe15
add poetry path to make sure poetry command works
td928 Nov 9, 2022
133cb38
remove import geocoding files and add wait geocoding process
td928 Nov 10, 2022
6415523
dockerfile for actions for poetry path
td928 Nov 10, 2022
a5d1587
add different poetry path
td928 Nov 10, 2022
5a95902
use path that works locally
td928 Nov 10, 2022
f656df5
specify poetry install location
td928 Nov 10, 2022
bcc575d
print path
td928 Nov 10, 2022
32ded07
add path to binary
td928 Nov 10, 2022
b591cda
add poetry installation in test.yml
td928 Nov 10, 2022
df8def1
poetry install
td928 Nov 10, 2022
b12c85e
update python
td928 Nov 10, 2022
00b2438
docker compose status
td928 Nov 10, 2022
7d5ab84
add geosupport container to actions
td928 Nov 10, 2022
6eaedad
remove sudo and break up steps
td928 Nov 10, 2022
bf526bb
remove docker command
td928 Nov 10, 2022
ef8c361
add postgresql install
td928 Nov 10, 2022
316356f
change to port 5432
td928 Nov 10, 2022
d263ba0
remove space and change it back to 25060
td928 Nov 10, 2022
e3ae085
give build engine a different port
td928 Nov 14, 2022
fb1316e
use postgres not local host
td928 Nov 14, 2022
9b19b5b
remove apt update
td928 Nov 14, 2022
2017132
add jq
td928 Nov 14, 2022
27bcf8d
add localhost to arg for postgis image
td928 Nov 14, 2022
4a5ee46
remove docker command
td928 Nov 14, 2022
64e94a4
specify different build engine
td928 Dec 2, 2022
569f843
use localhost
td928 Dec 2, 2022
62b8c4c
use ubuntu latest
td928 Dec 2, 2022
78026ec
specify user and db
td928 Dec 2, 2022
0574380
replace localhost
td928 Dec 2, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,16 @@ ARG VERSION_GEO="22.2.2"
FROM nycplanning/docker-geosupport:${VERSION_GEO}

# [Choice] Node.js version: none, lts/*, 16, 14, 12, 10
ARG NODE_VERSION="none"
RUN if [ "${NODE_VERSION}" != "none" ]; then su vscode -c "umask 0002 && . /usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi
# ARG NODE_VERSION="none"
# RUN if [ "${NODE_VERSION}" != "none" ]; then su vscode -c "umask 0002 && . /usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi

## Install postgres
RUN apt-get update
# && export DEBIAN_FRONTEND=noninteractive \
RUN apt-get -y install --no-install-recommends postgresql-client
RUN apt-get -y install --no-install-recommends postgresql-client wget
RUN wget https://dl.min.io/client/mc/release/linux-amd64/mc\
&& chmod +x mc\
&& mv ./mc /usr/bin


# [Optional] If your pip requirements rarely change, uncomment this section to add them to the image.
Expand All @@ -26,7 +29,7 @@ RUN apt-get -y install --no-install-recommends postgresql-client

# Install poetry
RUN curl -sSL https://install.python-poetry.org | python3 -
ENV PATH="~/.local/bin:$PATH"
ENV PATH=$HOME/.local/bin:$PATH

RUN /usr/local/bin/python3 -m pip install -U bandit
RUN /usr/local/bin/python3 -m pip install -U black
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
5432
],
// Use 'postCreateCommand' to run commands after the container is created.
"postCreateCommand": "apt-get install -y jq && poetry install",
"postCreateCommand": "export PATH=$HOME/.local/bin:$PATH && apt-get install -y jq && poetry install",
// Adding id_rsa so that we can push to github from the dev container
"initializeCommand": "ssh-add $HOME/.ssh/id_rsa"
// export PATH=$PATH:$HOME/.local/bin"
Expand Down
46 changes: 31 additions & 15 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,31 +32,36 @@ jobs:
github.event_name == 'push' &&
! contains(github.event.head_commit.message, '[skip]')
) || github.event_name != 'push'
runs-on: ubuntu-20.04
runs-on: ubuntu-latest
env:
BUILD_ENGINE: postgresql://postgres:postgres@127.0.0.1:5432/postgres
HED_BUILD_ENGINE: ${{ secrets.HED_BUILD_ENGINE }}
EDM_DATA: ${{ secrets.EDM_DATA }}
AWS_S3_ENDPOINT: ${{ secrets.AWS_S3_ENDPOINT }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
container:
image: nycplanning/docker-geosupport:latest
services:
db:
image: postgis/postgis:12-3.0-alpine
env:
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
POSTGRES_DB: devdb
POSTGRES_DB: postgres
options: >-
--shm-size=1g
--health-cmd pg_isready
--health-interval 10s
--health-timeout 5s
--health-retries 5
ports:
- 25060:5432
env:
BUILD_ENGINE: postgresql://postgres:postgres@localhost:25060/devdb
HED_BUILD_ENGINE: ${{ secrets.HED_BUILD_ENGINE }}
EDM_DATA: ${{ secrets.EDM_DATA }}
AWS_S3_ENDPOINT: ${{ secrets.AWS_S3_ENDPOINT }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
- 5432:5432
steps:
- uses: actions/checkout@v2

- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: config workflows
id: config
run: |
Expand All @@ -78,13 +83,21 @@ jobs:

- name: install dependencies ...
run: |
sudo apt update
sudo apt install -y gdal-bin
apt install -y gdal-bin postgresql-client jq

- name: Install and configure minio client ...
run: |
curl -O https://dl.min.io/client/mc/release/linux-amd64/mc
chmod +x mc
sudo mv ./mc /usr/bin
mv ./mc /usr/bin/
mc alias set spaces $AWS_S3_ENDPOINT $AWS_ACCESS_KEY_ID $AWS_SECRET_ACCESS_KEY --api S3v4

- name: Poetry Install
run: |
curl -sSL https://install.python-poetry.org | python3 -
export PATH=$PATH:$HOME/.local/bin
poetry install

- name: 1. dataloading for HED weekly builds
if: >-
steps.config.outputs.weekly == 'yes' &&
Expand All @@ -96,6 +109,9 @@ jobs:
steps.config.outputs.rebuild == 'yes' &&
steps.config.outputs.weekly == 'no'
run: ./devdb.sh dataloading edm && ls -l
env:
  # This job runs inside a container (`container:` is set on the job), so the
  # service container is reached by its service label on the shared Docker
  # network rather than through localhost and a host-mapped port.
  # The service is declared as `db`; `job.services.postgres` does not exist
  # in this workflow, so the previous port expression resolved to nothing.
  POSTGRES_HOST: db
  # Container-to-container traffic uses the port PostgreSQL listens on inside
  # the service container, independent of any host port mapping.
  POSTGRES_PORT: 5432

- name: Clear cache
run: rm -rf .library
Expand Down
35 changes: 35 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# See here for image contents: https://github.com/NYCPlanning/docker-geosupport

# [Choice] Geosupport version
ARG VERSION_GEO="22.2.2"
FROM nycplanning/docker-geosupport:${VERSION_GEO}

# [Choice] Node.js version: none, lts/*, 16, 14, 12, 10
# ARG NODE_VERSION="none"
# RUN if [ "${NODE_VERSION}" != "none" ]; then su vscode -c "umask 0002 && . /usr/local/share/nvm/nvm.sh && nvm install ${NODE_VERSION} 2>&1"; fi

## Install the PostgreSQL client and wget
# `apt-get update` and `apt-get install` share one RUN so a cached (stale)
# package-index layer can never be paired with a newer install step.
# DEBIAN_FRONTEND is set inline so it does not leak into the runtime env.
# The apt lists are intentionally left in place: later steps outside this file
# may run `apt-get install` without a preceding update.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get -y install --no-install-recommends \
        postgresql-client \
        wget

## Install the MinIO client (mc) straight into /usr/bin and make it executable
RUN wget https://dl.min.io/client/mc/release/linux-amd64/mc -O /usr/bin/mc \
    && chmod +x /usr/bin/mc


# [Optional] If your pip requirements rarely change, uncomment this section to add them to the image.
# COPY requirements.txt /tmp/pip-tmp/
# RUN pip3 --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \
#    && rm -rf /tmp/pip-tmp

# [Optional] Uncomment this section to install additional OS packages.
# RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
#     && apt-get -y install --no-install-recommends <your-package-list-here>
# RUN apt-get install jq

# Install poetry
RUN curl -sSL https://install.python-poetry.org | python3 -
# ENV only expands variables declared via ARG/ENV in the build; HOME is not one
# of them, so "$HOME/.local/bin" would become "/.local/bin" and `poetry` would
# not be found. Spell the installer's target directory out instead.
# NOTE(review): assumes the build runs as root (HOME=/root), which matches the
# un-sudo'ed apt-get calls above - confirm against the base image.
ENV PATH="/root/.local/bin:${PATH}"

# Developer tooling: security linter and code formatter, installed in one layer
# without keeping pip's download cache in the image.
RUN /usr/local/bin/python3 -m pip install --no-cache-dir -U bandit black
12 changes: 1 addition & 11 deletions bash/01_dataloading.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,29 +28,19 @@ import_public dcp_firecompanies &
import_public dcp_policeprecincts &
import_public dob_cofos &
import_public dof_shoreline &
import_public hny_geocode_results &

## Geocode results shares index with _geo_devdb
psql $BUILD_ENGINE -c "DROP TABLE IF EXISTS _geo_devdb;"
case $MODE in
weekly)
import_public dob_permitissuance &
import_public dob_jobapplications &
import_public dob_geocode_results &
;;
*)
import_public dob_permitissuance $DOB_DATA_DATE &
import_public dob_jobapplications $DOB_DATA_DATE &
import_public dob_geocode_results $DOB_DATA_DATE &
;;
esac

psql $BUILD_ENGINE -f sql/_create.sql

wait
display "data loading is complete"

psql $BUILD_ENGINE -c "
ALTER TABLE dob_geocode_results
RENAME TO _GEO_devdb;
"
display "data loading is complete"
10 changes: 9 additions & 1 deletion bash/02_build_devdb.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@ psql $BUILD_ENGINE -f sql/now/_init.sql
psql $BUILD_ENGINE -f sql/_init.sql
count _INIT_devdb

display "Assign geoms to _GEO_devdb and create GEO_devdb"
display "Geocoding DOB records"
poetry run python3 -m python.geocode
wait

# display "Assign geoms to _GEO_devdb and create GEO_devdb"
psql $BUILD_ENGINE -f sql/_geo.sql
psql $BUILD_ENGINE -f sql/_geo_corrections.sql
count GEO_devdb
Expand Down Expand Up @@ -85,6 +89,10 @@ display "Combining _MID_devdb with STATUS_devdb to create MID_devdb,
psql $BUILD_ENGINE -f sql/mid.sql
count MID_devdb

display "Geocoding HNY records"
poetry run python3 -m python.geocode_hny
wait

display "Creating HNY fields:
hny_id,
classa_hnyaff,
Expand Down
5 changes: 5 additions & 0 deletions bash/config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ function set_env {
done
}

# set path
export PATH=$PATH:$HOME/.local/bin
export PATH=$PATH:$HOME/.local/share/pypoetry/venv/bin/poetry
echo $PATH

# Setting Environmental Variables
set_env .env version.env
DATE=$(date "+%Y-%m-%d")
Expand Down
36 changes: 36 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
version: "3"

services:
  # Workspace container built from the repository's root Dockerfile.
  devdb:
    build:
      dockerfile: Dockerfile
      context: .

    # Share the db service's network namespace so the database is reachable on
    # localhost, and so "forwardPorts" in devcontainer.json can work.
    network_mode: service:db

    # Keep the container alive; without this it would exit as soon as the
    # image's default command finishes.
    command: sleep infinity

    # Mount the repository checkout into the container.
    volumes:
      - .:/workspace

    # To run all processes as a non-root user, uncomment the next line.
    # user: vscode

    # Publish app ports through "forwardPorts" in **devcontainer.json**.
    # (A "ports" entry in this file is not forwarded from a Codespace.)

  # PostGIS database backing the build.
  db:
    image: postgis/postgis:11-3.0-alpine
    restart: unless-stopped
    environment:
      POSTGRES_DB: postgres
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: postgres
    volumes:
      - postgres-data:/var/lib/postgresql/data
    # To reach PostgreSQL locally, add "forwardPorts": ["5432"] to **devcontainer.json**.
    # (A "ports" entry in this file is not forwarded from a Codespace.)

# Named volume that persists the database files across container restarts.
volumes:
  postgres-data:
73 changes: 29 additions & 44 deletions python/geocode.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@
from python.utils import psql_insert_copy
import pandas as pd
import os
from tqdm import tqdm
from dotenv import main

main.load_dotenv()

g = Geosupport()

OUTPUT_TABLE_NAME = "_init_geocoded"


def geocode(input):
# collect inputs
Expand Down Expand Up @@ -84,74 +85,58 @@ def parse_output(geo):
)


def load_applications(engine):
def load_init_devdb(engine):
df = pd.read_sql(
"""
SELECT
uid,
regexp_replace(
trim(house_number),
'(^|)0*', '', ''
) as house_number,
REGEXP_REPLACE(street_name, '[\s]{2,}' ,' ' , 'g') as street_name,
borough,
source
FROM (
SELECT
distinct ogc_fid as uid,
housenumber as house_number,
streetname as street_name,
borough,
'bis' as source
FROM dob_jobapplications UNION
SELECT
distinct ogc_fid as uid,
house_no as house_number,
street_name as street_name,
borough,
'now' as source
FROM dob_now_applications
) a LIMIT 400000
job_number,
address_numbr as house_number,
REGEXP_REPLACE(address_street, '[\s]{2,}' ,' ' , 'g') as street_name,
boro as borough
FROM _INIT_devdb
""",
engine,
)

print("loaded df from database")
return df

def geocode_insert_sql(df):
records = df.to_dict("records")

def geocode_insert_sql(records, engine):

# Multiprocess
with Pool(processes=cpu_count()) as pool:
it = tqdm(pool.map(geocode, records, 1000))
# it = tqdm(list(map(geocode, records)))
it = pool.map(geocode, records, len(records) // 4)

df = pd.DataFrame(it)
df.replace({"latitude": {"": None}, "longitude": {"": None}}, inplace=True)
df.to_sql(
"dob_geocode_results",
OUTPUT_TABLE_NAME,
con=engine,
if_exists="append",
index=False,
method=psql_insert_copy,
)


def clear_dob_geocode_results(engine):
engine.execute("DROP TABLE IF EXISTS dob_geocode_results")
engine.execute(f"DROP TABLE IF EXISTS {OUTPUT_TABLE_NAME}")


if __name__ == "__main__":
# connect to BUILD_ENGINE
engine = create_engine(os.environ["BUILD_ENGINE"])

df = load_applications(engine)
clear_dob_geocode_results(engine)
# df = df.iloc[:2000,:]
start =0
chunk_size = 50000
end = chunk_size
while end <= df.shape[0]:
print(f"geocoding records {start} through {end}")
geocode_insert_sql(df.iloc[start:end,:])
start = end
end = min(end+chunk_size, df.shape[0])


df = load_init_devdb(engine)
records = df.to_dict("records")

del df
start = 0
chunk_size = 10**4
end = min(chunk_size, len(records))
while start < len(records):
print(f"geocoding records {start} through {end}")
geocode_insert_sql(records[start:end], engine)
start = end
end = min(end + chunk_size, len(records))
Loading