Skip to content

Commit

Permalink
Merge pull request #20 from DS4SD/update_docs
Browse files Browse the repository at this point in the history
Updated documentation: Added convert doc to getting started
  • Loading branch information
dolfim-ibm authored Jul 10, 2022
2 parents 9375502 + c31aae7 commit 990c9a9
Show file tree
Hide file tree
Showing 23 changed files with 569 additions and 352 deletions.
6 changes: 2 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,15 @@ Python 3.8+
## Start using the toolkit

```console
// Login to Deep Search, see https://ds4sd.github.io/deepsearch-toolkit/getting_started/authentication/
// Login to Deep Search,
// see https://ds4sd.github.io/deepsearch-toolkit/getting_started/#authentication
$ deepsearch login
...


// Convert a document
// for more details, see https://ds4sd.github.io/deepsearch-toolkit/guide/convert_doc/
$ deepsearch documents convert -p 1234567890abcdefghijklmnopqrstvwyz123456 -u https://arxiv.org/pdf/2206.00785.pdf
--------------------------------------------------------------------------------------
Welcome to the Deep Search Toolkit
--------------------------------------------------------------------------------------
Submitting input: 100%|█████████████████████████████████████████████████████████████| 1/1 [00:01<00:00, 1.23s/it]
Converting input: 100%|█████████████████████████████████████████████████████████████| 1/1 [00:25<00:00, 25.61s/it]
Downloading result: 100%|█████████████████████████████████████████████████████████████| 1/1 [00:01<00:00, 1.50s/it]
Expand Down
1 change: 0 additions & 1 deletion deepsearch/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import typer


from deepsearch.core.cli.main import app
from deepsearch.core.cli.plugins import get_cli_groups
from deepsearch.cps.cli.main import app as cps_app
Expand Down
2 changes: 1 addition & 1 deletion deepsearch/cps/cli/data_indices_typer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from deepsearch.core.util.cli_output import OutputEnum, OutputOption, cli_output
from deepsearch.cps.apis.public.rest import ApiException
from deepsearch.cps.cli.cli_options import INDEX_KEY, SOURCE_PATH, PROJ_KEY, URL
from deepsearch.cps.cli.cli_options import INDEX_KEY, PROJ_KEY, SOURCE_PATH, URL
from deepsearch.cps.client.api import CpsApi
from deepsearch.cps.client.components.elastic import ElasticProjectDataCollectionSource
from deepsearch.cps.data_indices import utils
Expand Down
24 changes: 13 additions & 11 deletions deepsearch/cps/client/components/documents.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
from pathlib import Path
import os
from pathlib import Path
from typing import List, Optional

import urllib3

from typing import List, Optional
from deepsearch.cps.client.api import CpsApi
from deepsearch.documents.core.convert import (
get_download_url,
download_converted_documents,
get_download_url,
)
from deepsearch.documents.core.create_report import report_docs, report_urls
from deepsearch.cps.client.api import CpsApi

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

Expand Down Expand Up @@ -45,8 +46,9 @@ def download_all(self, result_dir: Path, progress_bar=False):
progress_bar: boolean, optional (default = False)
shows progress bar is True
"""
if not os.path.isdir(result_dir):
os.makedirs(result_dir)

result_dir = Path(result_dir)
result_dir.mkdir(parents=True, exist_ok=True)

urls = get_download_url(
cps_proj_key=self.proj_key, task_ids=self.task_ids, api=self._api
Expand All @@ -64,18 +66,18 @@ def generate_report(
Saves a csv report file for detailed information about the document conversion job.
Returns a dictionary object containing counts of files/urls converted.
"""
if not os.path.isdir(result_dir):
os.makedirs(result_dir)

if self._source_urls == None:
result_dir = Path(result_dir)
result_dir.mkdir(parents=True, exist_ok=True)

if self._source_path is not None:
info = report_docs(
result_dir=result_dir,
task_ids=self.task_ids,
statuses=self.statuses,
source_path=self._source_path,
)

if self._source_path == None:
elif self._source_urls is not None:
info = report_urls(
result_dir=result_dir,
urls=self._source_urls,
Expand Down
25 changes: 13 additions & 12 deletions deepsearch/cps/data_indices/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
from pathlib import Path
from typing import Any, List, Optional, Union

import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
Expand All @@ -18,6 +19,7 @@
success_message,
)
from deepsearch.documents.core.create_report import report_docs, report_urls
from deepsearch.documents.core.utils import cleanup, create_root_dir

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -68,18 +70,18 @@ def process_url_input(
Individual urls are uploaded for conversion and storage in data index.
"""

root_dir = input_process.create_root_dir()
root_dir = create_root_dir()

# container list for task_ids
task_ids = []
# submit urls
count_urls = len(urls)
with tqdm(
total=count_urls,
desc=f"{'Submitting input:': <{progressbar['padding']}}",
desc=f"{'Submitting input:': <{progressbar.padding}}",
disable=not (progress_bar),
colour=progressbar["colour"],
bar_format=progressbar["bar_format"],
colour=progressbar.colour,
bar_format=progressbar.bar_format,
) as progress:
for url in urls:
payload = {"file_url": url}
Expand All @@ -92,7 +94,7 @@ def process_url_input(
api=api, cps_proj_key=coords.proj_key, task_ids=task_ids
)
print(success_message)
report_urls(root_dir=root_dir, urls=urls, task_ids=task_ids, statuses=statuses)
report_urls(result_dir=root_dir, urls=urls, task_ids=task_ids, statuses=statuses)

return

Expand All @@ -108,7 +110,7 @@ def process_local_file(
"""

# process multiple files from local directory
root_dir = input_process.create_root_dir()
root_dir = create_root_dir()
# batch individual pdfs into zips and add them to root_dir
batched_files = input_process.batch_single_files(
source_path=local_file, root_dir=root_dir
Expand Down Expand Up @@ -136,10 +138,10 @@ def process_local_file(
# start loop
with tqdm(
total=count_total_files,
desc=f"{'Submitting input:': <{progressbar['padding']}}",
desc=f"{'Submitting input:': <{progressbar.padding}}",
disable=not (progress_bar),
colour=progressbar["colour"],
bar_format=progressbar["bar_format"],
colour=progressbar.colour,
bar_format=progressbar.bar_format,
) as progress:
# loop over all files
for single_zip in files_zip:
Expand All @@ -158,11 +160,10 @@ def process_local_file(
)
print(success_message)
report_docs(
root_dir=root_dir,
batched_files=batched_files,
result_dir=root_dir,
task_ids=task_ids,
statuses=statuses,
source_path=local_file,
)
input_process.cleanup(root_dir=root_dir)
cleanup(root_dir=root_dir)
return
15 changes: 9 additions & 6 deletions deepsearch/documents/cli/main.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
import urllib
from pathlib import Path
import urllib3, urllib

import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

import typer

from deepsearch.cps.cli.cli_options import SOURCE_PATH, PROJ_KEY, URL, PROGRESS_BAR
from deepsearch.cps.cli.cli_options import PROGRESS_BAR, PROJ_KEY, SOURCE_PATH, URL
from deepsearch.cps.client.api import CpsApi
from deepsearch.documents.core.main import convert_documents
from deepsearch.documents.core.utils import create_root_dir, get_urls
from deepsearch.cps.client.api import CpsApi

app = typer.Typer(no_args_is_help=True)

Expand Down Expand Up @@ -43,15 +45,16 @@ def convert(
"""
api = CpsApi.default_from_env()

input_urls = None
if urls is not None:
if urllib.parse.urlparse(urls).scheme in ("http", "https"):
urls = [urls]
input_urls = [urls]
else:
urls = get_urls(Path(urls))
input_urls = get_urls(Path(urls))

result = convert_documents(
proj_key=proj_key,
urls=urls,
urls=input_urls,
source_path=source_path,
progress_bar=progress_bar,
api=api,
Expand Down
17 changes: 12 additions & 5 deletions deepsearch/documents/core/common_routines.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
from dataclasses import dataclass

dashes = f"{'-'*86}"
WELCOME = f"{dashes}\n{'':>26}Welcome to the Deep Search Toolkit\n{dashes}"
ERROR_MSG = f"{dashes}\nSuggestion:\n(1) Check your input.\n(2) Contact Deep Search developers if problem persists.\n{dashes}"

# Setup for progress bars: total character width of the tqdm bar segment.
progressbar_length = 30


@dataclass
class ProgressBarParameters:
    """Shared styling parameters for the toolkit's tqdm progress bars.

    Consumed via the module-level ``progressbar`` singleton, e.g.
    ``desc=f"{'...': <{progressbar.padding}}"`` and
    ``colour=progressbar.colour``.
    """

    # Type annotations are required for @dataclass to treat these as
    # fields (unannotated names are plain class attributes and are
    # ignored by the generated __init__/__repr__/__eq__).
    padding: int = 22  # left-padding width for the bar description text
    colour: str = "#0f62fe"  # IBM blue
    # tqdm bar_format template; %d injects the fixed bar width.
    bar_format: str = "{l_bar}{bar:%d}{r_bar}{bar:-10b}" % progressbar_length


# Module-level singleton so all call sites share one consistent style.
progressbar = ProgressBarParameters()
success_message = "Whoa... it is done. Until next time, Ciao!"
29 changes: 15 additions & 14 deletions deepsearch/documents/core/convert.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import glob
import logging
import os
import pathlib
from pathlib import Path
from typing import Any, List, Optional
import logging

import requests
import urllib3
from tqdm import tqdm
Expand All @@ -15,7 +16,7 @@
from deepsearch.cps.client.api import CpsApi

from .common_routines import ERROR_MSG, progressbar
from .utils import download_url, URLNavigator
from .utils import URLNavigator, download_url

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -128,10 +129,10 @@ def send_files_for_conversion(
# start loop
with tqdm(
total=count_total_files,
desc=f"{'Submitting input:': <{progressbar['padding']}}",
desc=f"{'Submitting input:': <{progressbar.padding}}",
disable=not (progress_bar),
colour=progressbar["colour"],
bar_format=progressbar["bar_format"],
colour=progressbar.colour,
bar_format=progressbar.bar_format,
) as progress:
# loop over all files
for single_zip in files_zip:
Expand Down Expand Up @@ -167,10 +168,10 @@ def check_status_running_tasks(

with tqdm(
total=count_total,
desc=f"{'Converting input:': <{progressbar['padding']}}",
desc=f"{'Converting input:': <{progressbar.padding}}",
disable=not (progress_bar),
colour=progressbar["colour"],
bar_format=progressbar["bar_format"],
colour=progressbar.colour,
bar_format=progressbar.bar_format,
) as progress:
for task_id in task_ids:
request_status = check_single_task_status(
Expand Down Expand Up @@ -231,10 +232,10 @@ def download_converted_documents(

with tqdm(
total=len(download_urls),
desc=f"{'Downloading result:': <{progressbar['padding']}}",
desc=f"{'Downloading result:': <{progressbar.padding}}",
disable=not (progress_bar),
colour=progressbar["colour"],
bar_format=progressbar["bar_format"],
colour=progressbar.colour,
bar_format=progressbar.bar_format,
) as progress:
count = 1
for url in download_urls:
Expand Down Expand Up @@ -279,10 +280,10 @@ def send_urls_for_conversion(
task_ids = []
with tqdm(
total=count_urls,
desc=f"{'Submitting input:': <{progressbar['padding']}}",
desc=f"{'Submitting input:': <{progressbar.padding}}",
disable=not (progress_bar),
colour=progressbar["colour"],
bar_format=progressbar["bar_format"],
colour=progressbar.colour,
bar_format=progressbar.bar_format,
) as progress:
for url in urls:
task_id = submit_url_for_conversion(
Expand Down
4 changes: 3 additions & 1 deletion deepsearch/documents/core/create_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ def report_docs(
report_name = os.path.join(result_dir, "report.csv")

with tempfile.TemporaryDirectory() as tmpdir:
batched_files = batch_single_files(source_path=source_path, root_dir=tmpdir)
batched_files = batch_single_files(
source_path=source_path, root_dir=Path(tmpdir)
)

# batched_files only contains information about single pdfs
# user zips are collected again
Expand Down
7 changes: 2 additions & 5 deletions deepsearch/documents/core/input_process.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
import logging
import os
import tempfile
from pathlib import Path
from typing import List

import urllib3


from deepsearch.cps.client.api import CpsApi
from deepsearch.cps.client.components.documents import DocumentConversionResult

from .utils import batch_single_files

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
Expand All @@ -19,8 +18,6 @@
send_urls_for_conversion,
)

logger = logging.getLogger(__name__)


def process_local_input(
api: CpsApi, cps_proj_key: str, source_path: Path, progress_bar=False
Expand All @@ -29,7 +26,7 @@ def process_local_input(
Classify the user provided local input and take appropriate action.
"""
if not os.path.exists(source_path):
logger.error("Error: File not found. Check input.")
raise ValueError("File not found. Check input.")
else:
with tempfile.TemporaryDirectory() as tmpdir:
batched_files = batch_single_files(
Expand Down
7 changes: 5 additions & 2 deletions deepsearch/documents/core/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import urllib
from pathlib import Path
from typing import List, Optional, Union
import urllib, urllib3

import urllib3

from deepsearch.cps.client.api import CpsApi
from deepsearch.documents.core.input_process import (
process_local_input,
Expand All @@ -12,7 +15,7 @@

def convert_documents(
proj_key: str,
api: CpsApi = None,
api: CpsApi,
urls: Optional[Union[str, List[str]]] = None,
source_path: Optional[Path] = None,
progress_bar=False,
Expand Down
Loading

0 comments on commit 990c9a9

Please sign in to comment.