This repository has been archived by the owner on Sep 26, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 40
/
populate_company_info.sql
296 lines (275 loc) · 10.1 KB
/
populate_company_info.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
/*
For this script, it is necessary to get the data processed by sócios-brasil
in this branch https://github.com/turicas/socios-brasil/tree/novo-formato
Before running this script, make sure to save the acquired files 'empresa.csv',
'socio.csv' and 'establishment.csv' in the 'data/' folder.
*/
-- ------------------------
-- START DATA IMPORTATION
-- ------------------------
DROP TABLE IF EXISTS empresa_temp;
CREATE TABLE "empresa_temp" (
"cnpj_raiz" CHAR(8),
"razao_social" TEXT,
"codigo_natureza_juridica" INTEGER,
"codigo_qualificacao_responsavel" INTEGER,
"capital_social" TEXT,
"codigo_porte" CHAR(7),
"ente_federativo" TEXT
);
COPY empresa_temp FROM '/mnt/data/empresa.csv' DELIMITER ',' CSV HEADER;
DROP TABLE IF EXISTS estabelecimento_temp;
CREATE TABLE "estabelecimento_temp" (
"cnpj_raiz" CHAR(8),
"cnpj_ordem" CHAR(4),
"cnpj_dv" CHAR(2),
"matriz_filial" INTEGER,
"nome_fantasia" TEXT,
"codigo_situacao_cadastral" INTEGER,
"data_situacao_cadastral" CHAR(8),
"codigo_motivo_situacao_cadastral" INTEGER,
"cidade_exterior" TEXT,
"codigo_pais" CHAR(4),
"data_inicio_atividade" CHAR(8),
"cnae_principal" INTEGER,
"cnae_secundaria" TEXT,
"tipo_logradouro" TEXT,
"logradouro" TEXT,
"numero" TEXT,
"complemento" TEXT,
"bairro" TEXT,
"cep" TEXT,
"uf" TEXT,
"codigo_municipio" INTEGER,
"ddd_1" CHAR(4),
"telefone_1" TEXT,
"ddd_2" CHAR(4),
"telefone_2" TEXT,
"ddd_do_fax" CHAR(4),
"fax" TEXT,
"correio_eletronico" TEXT,
"situacao_especial" TEXT,
"data_situacao_especial" CHAR(8)
);
COPY estabelecimento_temp FROM '/mnt/data/estabelecimento.csv' DELIMITER ',' CSV HEADER;
DROP TABLE IF EXISTS socio_temp;
CREATE TABLE "socio_temp" (
"cnpj_raiz" CHAR(8),
"codigo_identificador" INTEGER,
"nome" TEXT,
"cpf_cnpj" CHAR(16),
"codigo_qualificacao" INTEGER,
"data_entrada_sociedade" CHAR(8),
"codigo_pais" CHAR(4),
"representante_cpf_cnpj" CHAR(16),
"representante" TEXT,
"representante_codigo_qualificacao" INTEGER,
"codigo_faixa_etaria" INTEGER
);
COPY socio_temp FROM '/mnt/data/socio.csv' DELIMITER ',' CSV HEADER;
-- -----------------------------------
-- START DATA CLEANING AND FORMATTING
-- -----------------------------------
-- HELPER METHODS
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
CREATE EXTENSION IF NOT EXISTS "unaccent";
CREATE OR REPLACE FUNCTION clean_text(text) RETURNS text
AS $$ select trim(upper(unaccent($1))); $$
LANGUAGE SQL
IMMUTABLE
RETURNS NULL ON NULL INPUT;
CREATE OR REPLACE FUNCTION format_date(text) RETURNS date
AS $$
BEGIN
RETURN TO_DATE($1,'YYYYMMDD');
EXCEPTION
WHEN others THEN RETURN NULL;
END;
$$
LANGUAGE plpgsql
IMMUTABLE
RETURNS NULL ON NULL INPUT;
CREATE OR REPLACE FUNCTION generate_uuid(nome text, cpf text) RETURNS uuid
AS $$
declare
cpf_six_digits text;
formatted_name text;
resp text;
begin
SELECT substring(cpf, 4, 6) INTO cpf_six_digits;
SELECT REPLACE(clean_text(nome), ' ', '-') INTO formatted_name;
SELECT
uuid_generate_v5(uuid_ns_url(), 'https://id.brasil.io/v1/person/' || cpf_six_digits || '-' || formatted_name)
INTO resp;
return resp;
end
$$
LANGUAGE plpgsql
IMMUTABLE
RETURNS NULL ON NULL INPUT;
-- AUXILIARY TABLE WITH CLEANED DATA
DROP TABLE IF EXISTS empresas_socios_temp;
CREATE TABLE "empresas_socios_temp" (
"cnpj_raiz" CHAR(8),
"cnpj_ordem" CHAR(4),
"cnpj_dv" CHAR(2),
"nome_empresa" TEXT,
"cnae_principal" INTEGER,
"cnae_secundaria" TEXT,
"data_inicio_atividade" DATE,
"uf" CHAR(4),
"nome_socio" TEXT,
"cpf_socio" TEXT,
"data_entrada_sociedade" DATE,
"socio_merge_uuid" uuid
);
CREATE INDEX temp_idx_cnpj_raiz ON empresas_socios_temp(cnpj_raiz);
CREATE INDEX temp_idx_cpf_socio ON empresas_socios_temp(cpf_socio);
CREATE INDEX temp_idx_socio_merge_uuid ON empresas_socios_temp(socio_merge_uuid NULLS LAST);
INSERT INTO empresas_socios_temp
SELECT
etabelecimento.cnpj_raiz,
etabelecimento.cnpj_ordem,
etabelecimento.cnpj_dv,
COALESCE(etabelecimento.nome_fantasia, empresa.razao_social) as nome_empresa,
etabelecimento.cnae_principal,
etabelecimento.cnae_secundaria,
format_date(etabelecimento.data_inicio_atividade) as data_inicio_atividade,
etabelecimento.uf,
associados.nome_socio,
associados.cpf_socio,
COALESCE(associados.data_entrada_sociedade, format_date(etabelecimento.data_inicio_atividade)) as data_entrada_sociedade,
generate_uuid(associados.nome_socio, associados.cpf_socio) as socio_merge_uuid
FROM estabelecimento_temp as etabelecimento
LEFT JOIN (
SELECT
empresa_temp.cnpj_raiz,
empresa_temp.razao_social
FROM empresa_temp
) as empresa ON empresa.cnpj_raiz = etabelecimento.cnpj_raiz
LEFT JOIN (
SELECT
socio.cnpj_raiz,
socio.nome as nome_socio,
socio.cpf_cnpj as cpf_socio,
format_date(socio.data_entrada_sociedade) as data_entrada_sociedade
FROM socio_temp as socio WHERE socio.codigo_identificador = 2 AND socio.cpf_cnpj != '***000000**'
UNION
select
empresa_cpf.cnpj_raiz,
empresa_cpf.nome_socio,
empresa_cpf.cpf as cpf_socio,
null as data_entrada_sociedade
FROM
(select
substring(empresa_temp.razao_social from '[0-9]*$') as cpf,
substring(empresa_temp.razao_social from '\D*') as nome_socio,
*
FROM empresa_temp) as empresa_cpf
WHERE (empresa_cpf.cpf = '') IS NOT TRUE
) as associados ON associados.cnpj_raiz = etabelecimento.cnpj_raiz;
-- -----------------------
-- POPULATING MAIN TABLES
-- -----------------------
-- ENRICHING CANDIDATES (owned_companies)
UPDATE core_candidate
SET owned_companies = candidates_companies.companies
FROM (
select candidate_id,
json_agg(companies_owned_by_candidates) as companies
from (
-- search from companies when we have the full cpf of the associate
select core_candidate.id as candidate_id,
cnpj_raiz,
cnpj_ordem,
cnpj_dv,
nome_empresa,
cnae_principal,
cnae_secundaria,
data_inicio_atividade,
uf,
data_entrada_sociedade
from empresas_socios_temp
inner join core_candidate
ON empresas_socios_temp.cpf_socio = core_candidate.taxpayer_id
where (empresas_socios_temp.cpf_socio not like '*%')
and (core_candidate.taxpayer_id = '') IS NOT TRUE
union
-- search from companies when we don't have the full cpf of the associate
-- and need to search by name + cpf (socio_merge_uuid)
select core_candidate.id as candidate_id,
cnpj_raiz,
cnpj_ordem,
cnpj_dv,
nome_empresa,
cnae_principal,
cnae_secundaria,
data_inicio_atividade,
uf,
data_entrada_sociedade
from empresas_socios_temp
inner join core_candidate
ON empresas_socios_temp.socio_merge_uuid = generate_uuid(core_candidate.name, core_candidate.taxpayer_id)
where (empresas_socios_temp.cpf_socio like '*%')
and (core_candidate.taxpayer_id = '') IS NOT TRUE
) as companies_owned_by_candidates
group by companies_owned_by_candidates.candidate_id
) as candidates_companies
where core_candidate.id = candidates_companies.candidate_id;
-- ENRICHING ELECTION INCOME STATEMENT (donor_company_information)
UPDATE core_election_income_statement
-- if a person has more than 1 company only one will be saved :(
SET donor_company_information = to_json(result)
FROM (
-- search data when donor has cnpj
select core_election_income_statement.id as income_id,
cnpj_raiz,
cnpj_ordem,
cnpj_dv,
nome_empresa,
cnae_principal,
cnae_secundaria,
data_inicio_atividade,
uf
from empresas_socios_temp
inner join core_election_income_statement
ON empresas_socios_temp.cnpj_raiz = substring(core_election_income_statement.donor_taxpayer_id, 1, 8)
where LENGTH(core_election_income_statement.donor_taxpayer_id) > 11
union
-- search data when donor has cpf and companies don't have the full cpf of the associate
select core_election_income_statement.id as income_id,
cnpj_raiz,
cnpj_ordem,
cnpj_dv,
nome_empresa,
cnae_principal,
cnae_secundaria,
data_inicio_atividade,
uf
from empresas_socios_temp
inner join core_election_income_statement
ON empresas_socios_temp.socio_merge_uuid = generate_uuid(core_election_income_statement.donor_name, core_election_income_statement.donor_taxpayer_id)
where (empresas_socios_temp.cpf_socio like '*%') and LENGTH(core_election_income_statement.donor_taxpayer_id) = 11
union
-- search data when donor has cpf and companies don't have the full cpf of
-- the associate and we need to search by name + cpf (socio_merge_uuid)
select core_election_income_statement.id as income_id,
cnpj_raiz,
cnpj_ordem,
cnpj_dv,
nome_empresa,
cnae_principal,
cnae_secundaria,
data_inicio_atividade,
uf
from empresas_socios_temp
inner join core_election_income_statement
ON empresas_socios_temp.cpf_socio = core_election_income_statement.donor_taxpayer_id
where (empresas_socios_temp.cpf_socio not like '*%') and LENGTH(core_election_income_statement.donor_taxpayer_id) = 11
) as result
where core_election_income_statement.id = result.income_id;
-- --------------------
-- CLEANING THINGS UP
-- --------------------
DROP TABLE IF EXISTS empresa_temp, estabelecimento_temp, socio_temp, empresas_socios_temp;
DROP FUNCTION IF EXISTS clean_text, format_date, generate_uuid;