perf(sinan): remove unnecessary cwd's in FTP_SINAN (#123)
* perf(sinan): remove unnecessary cwd's in FTP_SINAN

* Caching the login with datasus

* rolling back to other dbs, keeping sinan

* OSError: [Errno 99] Cannot assign requested address
luabida authored Mar 28, 2023
1 parent 1a45981 commit 5199685
Showing 1 changed file with 20 additions and 40 deletions.
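The "Caching the login with datasus" step in the commit message boils down to opening one authenticated FTP session and reusing it, instead of connecting, logging in and changing directory for every request. A minimal sketch of that pattern with plain ftplib (the two SINAN paths come from constants in this file; everything else here is illustrative, not code from the diff):

from ftplib import FTP

# One connection and one login, reused for every listing below.
ftp = FTP("ftp.datasus.gov.br")
ftp.login()
paths = [
    "/dissemin/publicos/SINAN/DADOS/FINAIS",
    "/dissemin/publicos/SINAN/DADOS/PRELIM",
]
listings = {path: ftp.nlst(path) for path in paths}  # no per-path cwd or login
ftp.close()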
pysus/online_data/__init__.py (20 additions, 40 deletions)
@@ -54,6 +54,11 @@
"CIHA": ["/dissemin/publicos/CIHA/201101_/Dados"],
}

def FTP_datasus():
ftp = FTP("ftp.datasus.gov.br")
ftp.login()
return ftp


def cache_contents():
"""
@@ -74,8 +79,8 @@ def parquets_to_dataframe(
at time.
"""

parquets = Path(parquet_dir).glob("*.parquet")

parquets = list(map(str, Path(parquet_dir).glob("*.parquet")))
try:
chunks_list = [
pd.read_parquet(str(f), engine="fastparquet") for f in parquets
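The changed line above materialises the glob into a list of path strings before the try block. A standalone sketch of the same chunked read, assuming a hypothetical cache directory (the concat step is not visible in this hunk and is only assumed here):

import pandas as pd
from pathlib import Path

parquet_dir = "/tmp/pysus/DENGBR22.parquet"   # hypothetical PySUS cache directory
parquets = list(map(str, Path(parquet_dir).glob("*.parquet")))
chunks = [pd.read_parquet(f, engine="fastparquet") for f in parquets]
df = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()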
@@ -180,8 +185,7 @@ def last_update_df(self) -> pd.DataFrame: # Legacy
)
return pd.DataFrame()

with FTP("ftp.datasus.gov.br") as ftp:
ftp.login()
with self.ftp_server.login() as ftp:
response = {
"folder": [],
"date": [],
@@ -306,10 +310,10 @@ def list_all(
chunks, to preserve memory, that are read using pandas and pyarrow.
"""
available_dbs = list()
ftp = FTP("ftp.datasus.gov.br")
ftp.login()
for path in self._ds_paths:
try:
ftp = FTP("ftp.datasus.gov.br")
ftp.login()
# CNES
if self.database == "CNES":
if not CNES_group:
@@ -349,8 +353,6 @@
)
except Exception as e:
raise e
finally:
FTP("ftp.datasus.gov.br").close()
return available_dbs


@@ -495,7 +497,7 @@ def url_regex(
if not year or not month or not UF:
raise ValueError("Missing year(s), month(s) or UF(s)")
file_pattern = re.compile(
rf"{SIA_group}{UF}{year}{month}.dbc", re.I
rf"{SIA_group}{UF}{year}{month}[abc]?.dbc", re.I
)
elif db == "PNI":
if not year or not UF:
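The SIA pattern above now accepts an optional a/b/c suffix, which shows up when a month's SIA file is split into multiple parts. A quick check of the updated regex with made-up group/UF/year/month values:

import re

SIA_group, UF, year, month = "PA", "SP", "19", "01"   # illustrative values only
file_pattern = re.compile(rf"{SIA_group}{UF}{year}{month}[abc]?.dbc", re.I)
for name in ("PASP1901.dbc", "PASP1901a.dbc", "PASP1901b.dbc"):
    print(name, bool(file_pattern.fullmatch(name)))   # all three match now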
@@ -535,7 +537,7 @@ def _extract_dbc(self, DBC_path: str, local_dir: str = cache_dir) -> str:
if Path(filepath).exists():
return str(filepath)
try:
ftp = FTP("ftp.datasus.gov.br")
ftp = ftp = FTP("ftp.datasus.gov.br")
ftp.login()
ftp.cwd(filedir)
ftp.retrbinary(
@@ -546,8 +548,6 @@ def _extract_dbc(self, DBC_path: str, local_dir: str = cache_dir) -> str:
except error_perm as e:
logging.error(f"Not able to download {filename}")
raise e
finally:
ftp.close()

def _dbfc_to_parquets(self, fpath: str, local_dir: str) -> str(PosixPath):
"""DBC/DBF files to parquets using Pandas & PyArrow"""
@@ -649,6 +649,10 @@ class FTP_SINAN:

def __init__(self, name: str) -> None:
self.name = self.__diseasecheck__(name)
ftp = FTP_datasus()
code = self.diseases[self.name]
self.finals = ftp.nlst(f"{DB_PATHS['SINAN'][0]}/{code}BR*.dbc")
self.prelims = ftp.nlst(f"{DB_PATHS['SINAN'][1]}/{code}BR*.dbc")

def __diseasecheck__(self, name: str) -> str:
return (
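With this change the FINAIS and PRELIM listings are fetched once, when the object is built, and cached on the instance. Hypothetical usage (the disease name must be one of the keys in FTP_SINAN.diseases; "Dengue" is assumed here):

sinan = FTP_SINAN("Dengue")                  # assumed disease name
print(len(sinan.finals), "final .dbc files")
print(len(sinan.prelims), "preliminary .dbc files")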
@@ -681,9 +685,8 @@ def extract_years(paths):
for path in paths
]

p = self._ftp_list_datasets_paths
prelim_years = extract_years(p(self.name, "prelim"))
finais_years = extract_years(p(self.name, "finais"))
prelim_years = extract_years(self.prelims)
finais_years = extract_years(self.finals)

if stage == "prelim":
return sorted(prelim_years)
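extract_years itself is mostly outside this hunk, but the paths it receives now come straight from the cached listings and look like ".../DENGBR07.dbc". A minimal stand-in (not the library's helper) for pulling the two-digit year out of such paths:

import re

paths = [
    "/dissemin/publicos/SINAN/DADOS/FINAIS/DENGBR07.dbc",
    "/dissemin/publicos/SINAN/DADOS/PRELIM/DENGBR23.dbc",
]
years = [re.search(r"BR(\d{2})\.dbc$", p, re.I).group(1) for p in paths]
print(years)   # ['07', '23']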
@@ -698,9 +701,8 @@ def get_ftp_paths(self, years: list) -> list:
is not available, it won't be included
in the result
"""
p = self._ftp_list_datasets_paths
prelim_paths = p(self.name, "prelim")
finais_paths = p(self.name, "finais")
prelim_paths = self.prelims
finais_paths = self.finals
all_paths = prelim_paths + finais_paths
ds_paths = list()

@@ -711,25 +713,3 @@ def mask(_year):
[ds_paths.append(path) for path in all_paths if mask(year) in path]

return ds_paths
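Hypothetical call: get_ftp_paths now filters the listings cached at construction time, so no further FTP round-trips happen here (the year format accepted by mask is assumed):

sinan = FTP_SINAN("Dengue")              # assumed disease name
paths = sinan.get_ftp_paths([2022])
print(paths)                             # e.g. [".../DENGBR22.dbc"] when available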

def _ftp_list_datasets_paths(self, disease: str, stage: str) -> list:
"""
stage: 'f'|'finais' or 'p'|'prelim'
"""
datasets_path = "/dissemin/publicos/SINAN/DADOS/"

if stage.startswith("f"):
datasets_path += "FINAIS"
elif stage.startswith("p"):
datasets_path += "PRELIM"
else:
raise ValueError(f"{stage}")

code = self.diseases[disease]

ftp = FTP("ftp.datasus.gov.br")
ftp.login()
ftp.cwd(datasets_path)
available_dbcs = ftp.nlst(f"{code}BR*.dbc")

return [f"{ftp.pwd()}/{dbc}" for dbc in available_dbcs]
