// Jest Snapshot v1, https://goo.gl/fbAQLP exports[`kosko generate --dev 1`] = ` "--- apiVersion: bitnami.com/v1alpha1 kind: SealedSecret metadata: name: elastic-recherche-entreprises-write annotations: app.gitlab.com/app: socialgouv-recherche-entreprises app.gitlab.com/env: preprod-dev42 app.gitlab.com/env.name: preprod-dev42 labels: application: v1-2-3-recherche-entreprises component: v1-2-3-recherche-entreprises owner: recherche-entreprises team: recherche-entreprises cert: wildcard namespace: recherche-entreprises-85-preprod-dev42 spec: encryptedData: ELASTICSEARCH_URL: >- AgB82d7LnDA/6RxRlks/7cRtmWuz6Ql1p4gCwdghu3X85Ek4FqSL6ui1tIBiuM1pZcaiwM6ZizsKA0ofq5iBafUwRQOzTFc3XAMy7XltrymG/QwBRmYKS4w4Ub1DPuYpVxEUC6Jngyex7OvhCKUK7pugjG6Q8FXO6i9iyVVEpKnAcSVLaUe+olmlOrO2RMjIK3mgKX/xOFT+2FYiN5/LJob+w/+p0hPlZaMsLrLOl/i5N4LuI5ckg+FawifD2MnN057fsLbwt0m63g7ZHvXtGT66tbTcQgpWfy5kLe2m7oIbzdk+oPoh4FS8PnyU6nMC8sOkC3v/GUMK91qCas01RBoyPRTTs11yX3gbYHti5Nc3zDt36YHPhrqfRQHQ8xONYkx5SkAylnDr1JoXyfrKDwZUBvLQ6Xh3gGI7qu909LxZ2ryWd9WRslpB1+8bOLN0tV20slAesGYFC/W6e5GT0AhwWqwJ9usGLf0dM7GE+IXJegIlconcM+2x/FW3RQ5XnK5kI/coiha5pxBkK0p8pbwmLnOwH5c2QoBD5xgLCZ3wleMcbTWdSynzm/LSYkWZzL15M3dy4m8c+qXc88LCbAHTZ2UaAH/XF7pcuMvOF33ZesjgaWLumFoUvhtaG1gZN97eU2/K1xLwo+x/vt06P6vUbU2Emj9cEziTaQqx1ZPUI4Hsp5ZNpbAGRsLMhCf7G7YvxZkpxh+7GAKvW1JmhF1DYgBapCen2bxQN/XK/pNniL89T0hiKSwh/Fo8yDOnqAGova3T1Nq6fLFcRYNxtKFVlsn5zsUgyVPCtyn+cUKIFw== ELASTICSEARCH_API_KEY: >- AgCWc+7OMggEJuNUXxL8CufezAMvJaT3svLkFi9KiKost/avFQNBoXDGixl7vERoCDVvDxnPlcn7m7MIjveP5oIfewhJzYQua9WmQrSMTab1soqUIjMHWj+2A8y0qSYg4s21w/X6bc+H0/O4+ax0QNVtm/MWc30vnCOF+uVibM2WcDkSY/FGE2bE7hcmQWeDbsmnRxDaZTY10ME2ycZpv9eAMoXshCQ98k4LFT9DM0D51az4BibLJjUtsW/vMn/FNKv9/teOGiCAFE/pfoyeZ4QN7ZsoFzCSHwj++Rd0hWScX8VCvbS5pgmfioz8SHmQFrxrN994CAPlr2Rw7mxfkzSETrDgzQVmzKD+j49PZL6cz77cCh9DGE70Pco91szWbsaUQ/lcWUTlFm39Z0Xmf6uV1eCSbyPcM3ogMx95Rjb3MnGa2zrt4OPLMMA2YfyqMtrrVTcq2aJ4fGvWAmVZHH3k2QMcPiYLGHQ8gnVsjI6LYGJEM27RVj99/ppZpoPWeims00fBHMclr3/4czfQCBKcr8GFouxfcKVYGJ4gRNdH0Qn5gZIPXOOiRKUVS0ik0zuT1Xu9u9kU6Df2RVOQawiIOy14+5RrMqt0li1PuCFZjRTqbsLGogOzJQfaXKOafNjxswUqXSn4gR3pVBmm0tPJx+9iFdSrkavvGBrQeSDrNzY5UjeZwIgz9hveUUe1dxARBK8iXu0Q8P6W+VnFActyQs0gnoc4Zzb3jJNfxgDWv1LF/MK22b5G2YcZ3bk5LSsGxP1KICJ0J1MMSL8= template: metadata: annotations: app.gitlab.com/app: socialgouv-recherche-entreprises app.gitlab.com/env: preprod-dev42 app.gitlab.com/env.name: preprod-dev42 name: elastic-recherche-entreprises-write labels: application: v1-2-3-recherche-entreprises component: v1-2-3-recherche-entreprises owner: recherche-entreprises team: recherche-entreprises cert: wildcard type: Opaque --- apiVersion: v1 kind: ConfigMap data: get-data.sh: > #!/bin/bash # borrowed from annuaire entreprise : # https://github.com/etalab/api-annuaire-entreprises/tree/master/db/init geodir=\${DATA_DIR}/geo mkdir -p $geodir echo \\"-- Download datasets\\" for d in \`seq -w 1 19\` 2A 2B \`seq 21 74\` \`seq 76 95\` 98 \\"\\"; do wget --progress=bar:force:noscroll -q --show-progress https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_$d.csv.gz --directory-prefix=$geodir gunzip \${geodir}/geo_siret_$d.csv.gz done #Cas particulier Paris for d in \`seq -w 1 20\`; do wget --progress=bar:force:noscroll -q --show-progress https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_751$d.csv.gz --directory-prefix=$geodir gunzip \${geodir}/geo_siret_751$d.csv.gz done #Cas particulier DOM for d in \`seq -w 1 8\`; do wget --progress=bar:force:noscroll -q --show-progress https://files.data.gouv.fr/geo-sirene/last/dep/geo_siret_97$d.csv.gz --directory-prefix=$geodir gunzip \${geodir}/geo_siret_97$d.csv.gz done # SIRET data wget --progress=bar:force:noscroll -q --show-progress https://files.data.gouv.fr/insee-sirene/StockUniteLegale_utf8.zip --directory-prefix=$DATA_DIR # WEEZ data wget --progress=bar:force:noscroll -q --show-progress https://www.data.gouv.fr/fr/datasets/r/a785345a-6e8c-4961-ae0a-bc00878e4f2e -O \${DATA_DIR}/WEEZ.csv assemble_data.py: | \\"\\"\\"CDTN Entreprises data assembler This script assembles data from different places and creates a new file that will be used as source for our search index. \\"\\"\\" import argparse import pandas as pd import numpy as np from os import listdir from os.path import isfile, join def read_siren(stock_unite_legale_file): \\"\\"\\" Read SIREN Stock Unite Legale Parameters ---------- stock_unite_legale_file: str The location of the CSV or ZIP file Returns ------- employeurs a Pandas dataframe containing the list of all companies that are still open and employ people \\"\\"\\" trancheEffectifsUniteLegale = \\"trancheEffectifsUniteLegale\\" categorieJuridiqueUniteLegale = \\"categorieJuridiqueUniteLegale\\" nomenclatureActivitePrincipaleUniteLegale = \\"nomenclatureActivitePrincipaleUniteLegale\\" categorieEntreprise = \\"categorieEntreprise\\" activitePrincipaleUniteLegale = \\"activitePrincipaleUniteLegale\\" selection = [\\"siren\\", \\"sigleUniteLegale\\", \\"nomUniteLegale\\", \\"nomUsageUniteLegale\\", 'denominationUniteLegale', \\"denominationUsuelle1UniteLegale\\", \\"denominationUsuelle2UniteLegale\\", \\"denominationUsuelle3UniteLegale\\", activitePrincipaleUniteLegale, trancheEffectifsUniteLegale, categorieJuridiqueUniteLegale, nomenclatureActivitePrincipaleUniteLegale, categorieEntreprise] etatAdmin = \\"etatAdministratifUniteLegale\\" caractereEmployeur = \\"caractereEmployeurUniteLegale\\" # we only select columns in use and convert to categorical dtype # in order to decrease the dataframe memory footprint cols = selection + [etatAdmin, caractereEmployeur] raw = pd.read_csv(stock_unite_legale_file, usecols=cols, dtype={ \\"siren\\": np.dtype(str), etatAdmin: \\"category\\", caractereEmployeur: \\"category\\", trancheEffectifsUniteLegale: \\"category\\", categorieJuridiqueUniteLegale: \\"category\\", nomenclatureActivitePrincipaleUniteLegale: \\"category\\", activitePrincipaleUniteLegale: \\"category\\", categorieEntreprise: \\"category\\"}, ) is_ouvert = raw[etatAdmin] == \\"A\\" is_employeur = raw[caractereEmployeur] == \\"O\\" is_admin = raw[etatAdmin] == \\"A\\" employeurs = raw[is_ouvert & is_employeur & is_admin] return employeurs[selection] def read_geo(geo_directory): \\"\\"\\" Read GEO data Parameters ---------- geo_directory: str The directory containing geo data for all regions Returns ------- all_geo a Pandas dataframe containing geo information for all open companies \\"\\"\\" geo_files = [f for f in listdir( geo_directory) if isfile(join(geo_directory, f))] geo_selection = [\\"enseigne1Etablissement\\", \\"enseigne2Etablissement\\", \\"enseigne3Etablissement\\", \\"denominationUsuelleEtablissement\\", \\"activitePrincipaleEtablissement\\", 'siren', 'siret', 'codePostalEtablissement', 'libelleCommuneEtablissement', \\"etatAdministratifEtablissement\\", \\"geo_adresse\\"] geo = {} for file in geo_files: geo[file] = pd.read_csv( geo_directory + file, dtype={\\"codePostalEtablissement\\": np.dtype(str), \\"etatAdministratifEtablissement\\": \\"category\\", \\"activitePrincipaleEtablissement\\": \\"category\\", \\"siret\\": np.dtype(str), \\"siren\\": np.dtype(str), }, usecols=geo_selection ) all_geo = pd.concat(geo.values(), ignore_index=True).dropna( subset=['siret']) all_geo = all_geo.astype(dtype={\\"codePostalEtablissement\\": np.dtype(str), \\"etatAdministratifEtablissement\\": \\"category\\", \\"activitePrincipaleEtablissement\\": \\"category\\", \\"siret\\": np.dtype(str), \\"siren\\": np.dtype(str), }) all_geo = all_geo[all_geo[\\"etatAdministratifEtablissement\\"] == \\"A\\"] return all_geo def read_idcc(idcc_file): \\"\\"\\" Read IDCC data Parameters ---------- idcc_file: str The location of the CSV file containing associations between companies and their \\"convention collectives\\", (aka WEEZ) Returns ------- idccs a Pandas dataframe containing siret / idcc associations \\"\\"\\" idccs = pd.read_csv(idcc_file, dtype={\\"SIRET\\": np.dtype(str)}, usecols=[\\"SIRET\\", \\"IDCC\\"]).rename( columns={\\"SIRET\\": \\"siret\\", \\"IDCC\\": \\"idcc\\"}) return idccs def assemble(siren, geo, idcc, output): sirenGeo = pd.merge(siren, geo, on='siren') merged = pd.merge(sirenGeo, idcc, how='left', on='siret') # add etablissement counts etsCounts = merged.siren.value_counts().rename_axis( 'siren').reset_index(name='etablissements') withEts = pd.merge(merged, etsCounts, on='siren') # persits as CSV file withEts.astype({'idcc': 'Int64'}).to_csv(output) def main(): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( 'siren_file', type=str, help=\\"Location of the StockUniteLegale CSV or ZIP file\\" ) parser.add_argument( 'geo_directory', type=str, help=\\"Location of the directory containing all the Geo CSV files\\" ) parser.add_argument( 'idcc_file', type=str, help=\\"Location of the siret/idcc CSV file (aka WEEZ)\\" ) parser.add_argument( 'output_file', type=str, help=\\"Location of the output file\\" ) args = parser.parse_args() print(\\"Read SIREN data\\") siren = read_siren(args.siren_file) print(\\"Read GEO data\\") geo = read_geo(args.geo_directory) print(\\"Read IDCC data\\") idcc = read_idcc(args.idcc_file) print(\\"Assemble datasets\\") assemble(siren, geo, idcc, args.output_file) if __name__ == \\"__main__\\": main() requirements.txt: | numpy pandas metadata: name: config-map-files-0123456 annotations: app.gitlab.com/app: socialgouv-recherche-entreprises app.gitlab.com/env: preprod-dev42 app.gitlab.com/env.name: preprod-dev42 labels: application: v1-2-3-recherche-entreprises component: v1-2-3-recherche-entreprises owner: recherche-entreprises team: recherche-entreprises cert: wildcard namespace: recherche-entreprises-85-preprod-dev42 --- apiVersion: batch/v1 kind: Job metadata: name: update-index-0123456 annotations: app.gitlab.com/app: socialgouv-recherche-entreprises app.gitlab.com/env: preprod-dev42 app.gitlab.com/env.name: preprod-dev42 labels: application: v1-2-3-recherche-entreprises component: v1-2-3-recherche-entreprises owner: recherche-entreprises team: recherche-entreprises cert: wildcard namespace: recherche-entreprises-85-preprod-dev42 spec: backoffLimit: 3 template: spec: containers: - name: update-index image: >- harbor.fabrique.social.gouv.fr/cdtn/recherche-entreprises-index:1.2.3 volumeMounts: - name: data mountPath: /data env: - name: ASSEMBLY_FILE value: /data/assembly.csv envFrom: - secretRef: name: elastic-recherche-entreprises-write resources: requests: cpu: '1' memory: 14Gi restartPolicy: Never volumes: - name: data emptyDir: {} - configMap: name: config-map-files-0123456 defaultMode: 511 name: local-files initContainers: - args: - '-c' - > apt-get update -y && apt-get install -y wget export DATA_DIR=\\"/data\\" cd /data echo \\"running get-data.sh...\\" /mnt/scripts/get-data.sh pip3 install -r /mnt/scripts/requirements.txt echo \\"running assemble_data.py...\\" python3 /mnt/scripts/assemble_data.py $DATA_DIR/StockUniteLegale_utf8.zip $DATA_DIR/geo/ $DATA_DIR/WEEZ.csv $DATA_DIR/assembly.csv command: - sh image: python:3.9.4 imagePullPolicy: Always name: download-data volumeMounts: - name: data mountPath: /data - mountPath: /mnt/scripts name: local-files metadata: annotations: app.gitlab.com/app: socialgouv-recherche-entreprises app.gitlab.com/env: preprod-dev42 app.gitlab.com/env.name: preprod-dev42 labels: application: v1-2-3-recherche-entreprises component: v1-2-3-recherche-entreprises owner: recherche-entreprises team: recherche-entreprises cert: wildcard " `;