diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 10df3a1..56aecc9 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -3,5 +3,4 @@ FROM mcr.microsoft.com/devcontainers/python:3.12-bookworm # Uninstall pre-installed formatting and linting tools # They would conflict with our pinned versions -RUN pipx uninstall flake8 RUN pipx uninstall mypy diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 9f84f0c..820411a 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -8,23 +8,15 @@ "vscode": { "settings": { "python.pythonPath": "/usr/local/bin/python", - "python.defaultInterpreterPath": "/usr/local/bin/python", - "python.languageServer": "Pylance", - "flake8.path": [ - "/usr/local/py-utils/bin/bandit", - "/usr/local/py-utils/bin/pydocstyle" - ], - "flake8.importStrategy": "fromEnvironment" + "python.defaultInterpreterPath": "/usr/local/bin/python" }, "extensions": [ "AykutSarac.jsoncrack-vscode", + "charliermarsh.ruff", "eamodio.gitlens", "Gruntfuggly.todo-tree", "matangover.mypy", - "ms-python.flake8", - "ms-python.isort", "ms-python.mypy-type-checker", - "ms-python.pylint", "ms-python.python", "ms-python.vscode-pylance", "njpwerner.autodocstring", diff --git a/.flake8 b/.flake8 deleted file mode 100644 index eb78037..0000000 --- a/.flake8 +++ /dev/null @@ -1,11 +0,0 @@ -# Used by cli and vscode -[flake8] -max-line-length=120 -max-complexity = 10 -exclude = - venv, - build, - dist -per-file-ignores = - __init__.py: D100, D101, D102, D103, D104, D106, D107, F401 - /*: D100, D101, D102, D103, D104, D106, D107 diff --git a/.github/flake8-error-problem-matcher.json b/.github/flake8-error-problem-matcher.json deleted file mode 100644 index a5ba306..0000000 --- a/.github/flake8-error-problem-matcher.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "problemMatcher": [ - { - "owner": "flake8-error", - "severity": "error", - "pattern": [ - { - "regexp": 
"^([^:]+):(\\d+):(\\d+):\\s+(E\\d+\\s+.+)$", - "file": 1, - "line": 2, - "column": 3, - "message": 4 - } - ] - } - ] -} \ No newline at end of file diff --git a/.github/flake8-warning-problem-matcher.json b/.github/flake8-warning-problem-matcher.json deleted file mode 100644 index dcffa07..0000000 --- a/.github/flake8-warning-problem-matcher.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "problemMatcher": [ - { - "owner": "flake8-warning", - "severity": "warning", - "pattern": [ - { - "regexp": "^([^:]+):(\\d+):(\\d+):\\s+([CFNW]\\d+\\s+.+)$", - "file": 1, - "line": 2, - "column": 3, - "message": 4 - } - ] - } - ] -} \ No newline at end of file diff --git a/.github/workflows/01-ci.yml b/.github/workflows/01-ci.yml new file mode 100644 index 0000000..1ed66c6 --- /dev/null +++ b/.github/workflows/01-ci.yml @@ -0,0 +1,79 @@ +name: 01 | ai-data-preprocessing-queue | continuous integration +run-name: State ${{ github.ref }} +on: + push: + branches: + - "**" + paths-ignore: + - README.md + - LICENSE.txt + - .gitignore + - .vscode/** + - .devcontainer/** + +jobs: + changes: + name: changes + runs-on: ubuntu-latest + timeout-minutes: 1 + outputs: + ai_data_preprocessing_queue: ${{ steps.filter.outputs.ai_data_preprocessing_queue }} + first_run: ${{ steps.first_run.outputs.first_run }} + steps: + - uses: actions/checkout@v4 + - uses: SamhammerAG/first-build-action@v2.1 + id: first_run + with: + workflow: 01 | ai-data-preprocessing-queue | continuous integration + branch: ${{ github.ref_name }} + - uses: dorny/paths-filter@v3 + id: filter + with: + base: ${{ github.ref }} + filters: | + ai_data_preprocessing_queue: + - '.github/workflows/01-ci.yml' + - 'ai_data_preprocessing_queue/**' + + build_python: + name: Python | build + # Required so that needs.changes.outputs.* resolves in the step conditions below + needs: changes + runs-on: ubuntu-latest + timeout-minutes: 5 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: 
preparation | Add linting matchers + run: | + echo 
"::add-matcher::.github/mypy-error-problem-matcher.json" + - name: preparation | upgrade pip + if: needs.changes.outputs.ai_data_preprocessing_queue == 'true' || needs.changes.outputs.first_run == 'true' + run: python -m pip install --upgrade pip + - name: ai-data-preprocessing-queue | Install dependencies + if: needs.changes.outputs.ai_data_preprocessing_queue == 'true' || needs.changes.outputs.first_run == 'true' + run: pip install -r requirements-dev.txt + - name: ai-data-preprocessing-queue | ruff check + if: needs.changes.outputs.ai_data_preprocessing_queue == 'true' || needs.changes.outputs.first_run == 'true' + run: | + ruff check --output-format github . + - name: ai-data-preprocessing-queue | Lint typing + if: needs.changes.outputs.ai_data_preprocessing_queue == 'true' || needs.changes.outputs.first_run == 'true' + # Must run with installed dependencies + # Must be executed from root with path to project as param because + # the problem matcher needs an absolute path in error message to work correctly + working-directory: ./ + run: | + mypy --config-file .mypy.ini --show-column-numbers . + - name: Test ai-data-preprocessing-queue + if: needs.changes.outputs.ai_data_preprocessing_queue == 'true' || needs.changes.outputs.first_run == 'true' + run: python -m pytest --cov=ai_data_preprocessing_queue --cov-report=xml tests + - name: ai-data-preprocessing-queue | Report coverage + if: needs.changes.outputs.ai_data_preprocessing_queue == 'true' || needs.changes.outputs.first_run == 'true' + uses: MishaKav/pytest-coverage-comment@main + with: + title: Test coverage for changes on ai-data-preprocessing-queue + pytest-xml-coverage-path: ./coverage.xml + report-only-changed-files: true \ No newline at end of file diff --git 
a/.github/workflows/publish.yml b/.github/workflows/02-publish.yml similarity index 89% rename from .github/workflows/publish.yml rename to .github/workflows/02-publish.yml index 4ed3537..4cad8d2 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/02-publish.yml @@ -8,8 +8,8 @@ jobs: name: Build and/or publish Python 🐍 distributions 📦 to PyPI runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: "3.12" - run: >- diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index c7bb5db..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,47 +0,0 @@ -name: ci -run-name: CI ${{ github.ref }} -on: - push: - branches: - - "**" - paths-ignore: - - README.md - - LICENSE.txt - - .gitignore - - .vscode/** - - .devcontainer/** - -jobs: - build_python: - name: Python | build - runs-on: ubuntu-latest - timeout-minutes: 5 - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - with: - python-version: "3.12" - - name: Add linting matchers - run: | - echo "::add-matcher::.github/flake8-error-problem-matcher.json" - echo "::add-matcher::.github/flake8-warning-problem-matcher.json" - echo "::add-matcher::.github/mypy-error-problem-matcher.json" - - name: Lint - uses: py-actions/flake8@v2 - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-dev.txt - - name: Lint typing - # Must run with installed dependencies - run: | - mypy --config-file .mypy.ini --show-column-numbers . 
- - name: Run tests - run: python -m pytest --cov=ai_data_preprocessing_queue --cov-report=xml tests - - name: Prepare coverage file - run: coverage-lcov - - name: Coveralls upload - uses: coverallsapp/github-action@master - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - path-to-lcov: lcov.info diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index b5e7249..0000000 --- a/.pylintrc +++ /dev/null @@ -1,5 +0,0 @@ -[FORMAT] -max-line-length=120 - -[MESSAGES CONTROL] -disable=C0114, C0115, C0116 \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index 8bce6fd..6818dff 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -9,13 +9,11 @@ "files.trimTrailingWhitespace": true, "files.exclude": { ".coverage": true, - ".flake8": true, ".gitattributes": true, ".gitignore": true, ".idea": true, ".mypy_cache": true, ".mypy.ini": true, - ".pylintrc": true, "*.egg-info": true, "**/__pycache__": true, "**/.pytest_cache": true, @@ -29,8 +27,8 @@ "python.testing.pytestEnabled": true, "python.testing.unittestEnabled": false, "[python]": { - "editor.defaultFormatter": "ms-python.black-formatter", - "editor.formatOnSave": true + "editor.formatOnSave": true, + "editor.defaultFormatter": "charliermarsh.ruff" }, "python.analysis.diagnosticSeverityOverrides": { "reportPrivateUsage": "information", diff --git a/.vscode/tasks.json b/.vscode/tasks.json index 3ebb31f..6410984 100644 --- a/.vscode/tasks.json +++ b/.vscode/tasks.json @@ -14,7 +14,17 @@ { "label": "Lint", "type": "shell", - "command": "flake8 . 
--count --statistics; mypy .", + "command": "ruff check; mypy .", + "group": "test", + "presentation": { + "reveal": "always", + "panel": "new" + } + }, + { + "label": "Format all", + "type": "shell", + "command": "ruff format", "group": "test", "presentation": { "reveal": "always", diff --git a/ai_data_preprocessing_queue/Steps/language_detect.py b/ai_data_preprocessing_queue/Steps/language_detect.py index 9bc19c6..1555b79 100644 --- a/ai_data_preprocessing_queue/Steps/language_detect.py +++ b/ai_data_preprocessing_queue/Steps/language_detect.py @@ -1,11 +1,11 @@ -""" -Detects one of the following languages and writes the language to local state. +"""Detects one of the following languages and writes the language to local state. af, ar, bg, bn, ca, cs, cy, da, de, el, en, es, et, fa, fi, fr, gu, he, hi, hr, hu, id, it, ja, kn, ko, lt, lv, mk, ml, mr, ne, nl, no, pa, pl, pt, ro, ru, sk, sl, so, sq, sv, sw, ta, te, th, tl, tr, uk, ur, vi, zh-cn, zh-tw """ + from typing import Any from langdetect import detect diff --git a/ai_data_preprocessing_queue/Steps/remove_numbers.py b/ai_data_preprocessing_queue/Steps/remove_numbers.py index db129bb..eb3dd10 100644 --- a/ai_data_preprocessing_queue/Steps/remove_numbers.py +++ b/ai_data_preprocessing_queue/Steps/remove_numbers.py @@ -3,5 +3,4 @@ def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | None, preprocessor_data: str) -> Any: - item = re.sub(r"""\d""", " ", item) - return item + return re.sub(r"""\d""", " ", item) diff --git a/ai_data_preprocessing_queue/Steps/remove_punctuation.py b/ai_data_preprocessing_queue/Steps/remove_punctuation.py index 7e94d65..da0de26 100644 --- a/ai_data_preprocessing_queue/Steps/remove_punctuation.py +++ b/ai_data_preprocessing_queue/Steps/remove_punctuation.py @@ -3,5 +3,4 @@ def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | None, preprocessor_data: str) -> Any: - item = re.sub(r"[^\w\s]", " ", item) - return item + return 
re.sub(r"[^\w\s]", " ", item) diff --git a/ai_data_preprocessing_queue/Steps/remove_signature.py b/ai_data_preprocessing_queue/Steps/remove_signature.py index 9b73b8e..cc4eb35 100644 --- a/ai_data_preprocessing_queue/Steps/remove_signature.py +++ b/ai_data_preprocessing_queue/Steps/remove_signature.py @@ -6,15 +6,30 @@ def remove_newline(text: str) -> str: """Remove excessive newlines or spaces from the text.""" pattern = re.compile(r"\s{2,}|[\n\r]{3,}") result = pattern.sub(" ", text) - result = re.sub(r"\s+", " ", result).strip() - - return result - - -GreetingExpressions = ["sincerely", "best regards", "happy holidays", "kind regards", "warm regards", "cheers", - "regards", "mit freundlichen grüßen", "freundliche grüße", "beste grüße", "viele grüße", - "herzliche grüße", "liebe grüße", "mit freundlichen grüssen", "freundliche grüsse", - "beste grüsse", "viele grüsse", "herzliche grüsse", "liebe grüsse"] + return re.sub(r"\s+", " ", result).strip() + + +GreetingExpressions = [ + "sincerely", + "best regards", + "happy holidays", + "kind regards", + "warm regards", + "cheers", + "regards", + "mit freundlichen grüßen", + "freundliche grüße", + "beste grüße", + "viele grüße", + "herzliche grüße", + "liebe grüße", + "mit freundlichen grüssen", + "freundliche grüsse", + "beste grüsse", + "viele grüsse", + "herzliche grüsse", + "liebe grüsse", +] greetings_regex = r"(" + "|".join(GreetingExpressions) + r")\s*,?\s*" @@ -26,15 +41,15 @@ def remove_greetings_and_following_text(text: str) -> str: # thank you expressions should be removed after greetings and following signature text, # as they often appear at the beginning of a message THANK_EXPRESSIONS = [ - r"thank you(?: very much)?", # thank you, thank you very much - r"thankyou(?: very much)?", # thankyou, thankyou very much - r"thanks(?: a lot| again)?", # thanks, thanks a lot, thanks again - r"many thanks", # many thanks - r"a thousand thanks", # a thousand thanks - r"danke(?: schön)?", # danke, danke schön, danke 
und - r"vielen dank", # vielen dank - r"dankeschön", # dankeschön - r"besten dank" # besten dank + r"thank you(?: very much)?", # thank you, thank you very much + r"thankyou(?: very much)?", # thankyou, thankyou very much + r"thanks(?: a lot| again)?", # thanks, thanks a lot, thanks again + r"many thanks", # many thanks + r"a thousand thanks", # a thousand thanks + r"danke(?: schön)?", # danke, danke schön, danke und + r"vielen dank", # vielen dank + r"dankeschön", # dankeschön + r"besten dank", # besten dank ] # Suffixes which could follow thank you expressions @@ -43,15 +58,13 @@ def remove_greetings_and_following_text(text: str) -> str: r"(?:for (?:your|the) (?:help|support|understanding|assistance))", r"(?:schon mal\s+)?(?:im voraus\s+)?für\s+(?:ihre|ihr|eure|die|den)\s+(?:hilfe|support|verständnis)", r"vorab", - r"kindly?" + r"kindly?", ] # Combine them into a final regex pattern and compile thank_expressions = r"|".join(THANK_EXPRESSIONS) suffixes = r"(?:\s+(?:" + r"|".join(THANK_SUFFIXES) + r"))?" 
-final_pattern = ( - r"\b(?:" + thank_expressions + r")" + suffixes + r"\s*(?:,|\.|!|;)?\s*" -) +final_pattern = r"\b(?:" + thank_expressions + r")" + suffixes + r"\s*(?:,|\.|!|;)?\s*" thanking_regex = re.compile(final_pattern, flags=re.IGNORECASE | re.UNICODE) diff --git a/ai_data_preprocessing_queue/Steps/spellcheck.py b/ai_data_preprocessing_queue/Steps/spellcheck.py index 252e400..7cf31ae 100644 --- a/ai_data_preprocessing_queue/Steps/spellcheck.py +++ b/ai_data_preprocessing_queue/Steps/spellcheck.py @@ -30,9 +30,12 @@ def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | N all_words_to_check: Any = reduce(lambda x, y: cast(str, x) + cast(str, y), items) for w in all_words_to_check: - if len(item_word) < 4 and _levenshtein(item_word, w) == 1: - item = item.replace(item_word, w) - elif len(item_word) >= 4 and 1 <= _levenshtein(item_word, w) <= 2: + if ( + len(item_word) < 4 # noqa: PLR2004 + and _levenshtein(item_word, w) == 1 + or len(item_word) >= 4 # noqa: PLR2004 + and 1 <= _levenshtein(item_word, w) <= 2 # noqa: PLR2004 + ): item = item.replace(item_word, w) return item diff --git a/ai_data_preprocessing_queue/Steps/text_only.py b/ai_data_preprocessing_queue/Steps/text_only.py index a884ad5..e26814e 100644 --- a/ai_data_preprocessing_queue/Steps/text_only.py +++ b/ai_data_preprocessing_queue/Steps/text_only.py @@ -4,5 +4,4 @@ def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | None, preprocessor_data: str) -> Any: item = re.sub(r"[^\w\s]", " ", item) - item = re.sub(r"""\d""", " ", item) - return item + return re.sub(r"""\d""", " ", item) diff --git a/ai_data_preprocessing_queue/Steps/token_replacement.py b/ai_data_preprocessing_queue/Steps/token_replacement.py index 0c12a85..cd74764 100644 --- a/ai_data_preprocessing_queue/Steps/token_replacement.py +++ b/ai_data_preprocessing_queue/Steps/token_replacement.py @@ -15,7 +15,7 @@ def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | 
N # also replace dots at end of word if not line[0].endswith("."): - regex = regex + "\\b" + regex += "\\b" pattern = re.compile(regex) item = pattern.sub(line[1], item) @@ -38,18 +38,16 @@ def _get_data_from_store_or_reload(global_state: dict[str, Any] | None, preproce def _prepare_pre_processor_data(preprocessor_data: str) -> list[list[str]]: lines: list[list[str]] = [ - [s.strip() for i, s in enumerate(line.split(",")) if (i == 2 and re.compile(r"^[0-9\s]+$").match(s)) or i < 2] + [s.strip() for i, s in enumerate(line.split(",")) if (i == 2 and re.compile(r"^[0-9\s]+$").match(s)) or i < 2] # noqa: PLR2004 for line in preprocessor_data.splitlines() - if line.count(",") == 2 + if line.count(",") == 2 # noqa: PLR2004 ] - lines = [line for line in lines if len(line) == 3] + lines = [line for line in lines if len(line) == 3] # noqa: PLR2004 i: int = 0 while i < len(lines): lines[i][2] = int(lines[i][2]) # type: ignore i += 1 - # sort - lines = sorted(lines, key=lambda f: 0 - f[2]) # type: ignore - - return lines + # sort and return + return sorted(lines, key=lambda f: 0 - f[2]) # type: ignore diff --git a/ai_data_preprocessing_queue/__init__.py b/ai_data_preprocessing_queue/__init__.py index af8c710..7a99790 100644 --- a/ai_data_preprocessing_queue/__init__.py +++ b/ai_data_preprocessing_queue/__init__.py @@ -1 +1 @@ -from .Pipeline import Pipeline +from .Pipeline import Pipeline as Pipeline diff --git a/pyproject.toml b/pyproject.toml index 0f3470b..6204dd3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "ai-data-preprocessing-queue" -version = "1.7.1" +version = "1.7.2" description = "A collection of different text processing steps that can be enabled or disabled dynamically." 
authors = ["KI-Team"] license = "MIT" @@ -16,8 +16,6 @@ pandas = "*" [tool.poetry.group.dev.dependencies] build = "*" coverage-lcov = "*" -flake8-bandit = "*" -flake8-pydocstyle = "*" mypy = "*" parameterized = "*" pytest = "*" @@ -27,3 +25,100 @@ types-mock = "*" [build-system] requires = ["setuptools>=61.0"] build-backend = "setuptools.build_meta" + +[tool.ruff] +line-length = 120 +# Exclude a variety of commonly ignored directories. +exclude = [ + ".bzr", + ".direnv", + ".eggs", + ".env", + ".git", + ".git-rewrite", + ".hg", + ".ipynb_checkpoints", + ".mypy_cache", + ".nox", + ".pants.d", + ".pyenv", + ".pytest_cache", + ".pytype", + ".ruff_cache", + ".svn", + ".tox", + ".venv", + ".vscode", + "__pypackages__", + "_build", + "buck-out", + "build", + "dist", + "node_modules", + "site-packages", + "venv", +] + + +[tool.ruff.format] +quote-style = "double" + +[tool.ruff.lint.pylint] +max-nested-blocks = 6 +max-args = 8 +max-positional-args = 8 + +[tool.ruff.lint] +preview = true +select = [ + "ASYNC", # Async: https://docs.astral.sh/ruff/rules/#flake8-async-async + "B", # Flake8 Bugbear: https://docs.astral.sh/ruff/rules/#flake8-bugbear-b + "C901", # complex-structure + "D", # Docstyle: https://docs.astral.sh/ruff/rules/#pydocstyle-d + "DOC", # Pydoclint: https://docs.astral.sh/ruff/rules/#pydoclint-doc + "E", # Errors: https://docs.astral.sh/ruff/rules/#error-e + "F", # Flakes: https://docs.astral.sh/ruff/rules/#pyflakes-f + "G", # Logging format: https://docs.astral.sh/ruff/rules/#flake8-logging-format-g + "I", # Isort: https://docs.astral.sh/ruff/rules/#isort-i + "ISC001", # single-line-implicit-string-concatenation + "N", # Naming: https://docs.astral.sh/ruff/rules/#pep8-naming-n + "PL", # Pylint: https://docs.astral.sh/ruff/rules/#pylint-pl + "Q", # Quotes: https://docs.astral.sh/ruff/rules/#flake8-quotes-q + "RET", # Return: https://docs.astral.sh/ruff/rules/#flake8-return-ret + "SIM", # Simplify: https://docs.astral.sh/ruff/rules/#flake8-simplify-sim + 
"SLF", # Private member access: https://docs.astral.sh/ruff/rules/#flake8-self-slf + "UP", # Upgrade: https://docs.astral.sh/ruff/rules/#pyupgrade-up + "W", # Warnings: https://docs.astral.sh/ruff/rules/#warning-w +] +ignore = [ + "B017", # Do not assert blind exception: `Exception` + "D100", # undocumented-public-module + "D101", # undocumented-public-class + "D102", # undocumented-public-method + "D103", # undocumented-public-function + "D104", # undocumented-public-package + "D105", # undocumented-magic-method + "D106", # undocumented-public-nested-class + "D107", # undocumented-public-init + "D203", # one blank line required before class docstring + "D213", # multi-line summary starts on second line + "DOC201", # docstring-missing-returns + "DOC501", # docstring-missing-exception + "ISC001", # single-line-implicit-string-concatenation + "N818", # error-suffix-on-exception-name + "N999", # invalid-module-name +] + +[tool.ruff.lint.per-file-ignores] +"tests/**" = [ + "D400", # first-line-should-end-with-period + "D415", # first-line-should-end-with-punctuation + "PLC2701", # import-private-name + "PLR2004", # magic-value-comparison + "PLR6301", # static method + "PLR0904", # too-many-public-methods + "SLF001", # private-member-access + "N802", # invalid-function-name + "N818", # error-suffix-on-exception-name + "N999", # invalid-module-name +] \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt index 0e1d798..0c43865 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,10 +1,9 @@ -r requirements.txt coverage-lcov build -flake8-bandit -flake8-pydocstyle mypy parameterized pytest pytest-cov +ruff diff --git a/setup.py b/setup.py index 73d1de0..2d5601d 100644 --- a/setup.py +++ b/setup.py @@ -1,17 +1,17 @@ import setuptools -with open("README.md", "r") as fh: +with open("README.md", encoding="utf-8") as fh: LONG_DESCRIPTION = fh.read() -with open("requirements.txt", "r") as fin: +with open("requirements.txt", encoding="utf-8") as 
fin: REQS = 
fin.read().splitlines() -with open("requirements-dev.txt", "r") as fin: +with open("requirements-dev.txt", encoding="utf-8") as fin: REQS_DEV = [item for item in fin.read().splitlines() if not item.endswith(".txt")] setuptools.setup( name="ai-data-preprocessing-queue", - version="1.7.1", + version="1.7.2", description="Can be used to pre process data before ai processing", long_description=LONG_DESCRIPTION, long_description_content_type="text/markdown", diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 9a8c1e0..bd53d5b 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -48,7 +48,7 @@ def test_regex_replacement_do_not_crash_for_no_data(self) -> None: self.assertEqual("test text", value) def test_regex_replacement(self) -> None: - with open(path.join(ABS_PATH_TEST_DATA, "regex_replacement_testdata.csv"), "r", encoding="utf-8") as handler: + with open(path.join(ABS_PATH_TEST_DATA, "regex_replacement_testdata.csv"), encoding="utf-8") as handler: pipeline = Pipeline({"regex_replacement": handler.read()}) # date value = pipeline.consume("test 1.1.2019 20.2.2003 1.1.20 01.01.20 1.1.1900 1.1. 01.01. 
test") @@ -88,19 +88,19 @@ def test_token_replacement_do_not_crash_for_no_data(self) -> None: self.assertEqual("test text", value) def test_token_replacement(self) -> None: - with open(path.join(ABS_PATH_TEST_DATA, "token_replacement_testdata.csv"), "r", encoding="utf-8") as handler: + with open(path.join(ABS_PATH_TEST_DATA, "token_replacement_testdata.csv"), encoding="utf-8") as handler: pipeline = Pipeline({"token_replacement": handler.read()}) value = pipeline.consume("test asd bla 1212") self.assertEqual("test www blub 1212", value) def test_token_replacement_do_not_replace_parts_of_word(self) -> None: - with open(path.join(ABS_PATH_TEST_DATA, "token_replacement_testdata.csv"), "r", encoding="utf-8") as handler: + with open(path.join(ABS_PATH_TEST_DATA, "token_replacement_testdata.csv"), encoding="utf-8") as handler: pipeline = Pipeline({"token_replacement": handler.read()}) value = pipeline.consume("test abg. abgabgeschlossen 1212") self.assertEqual("test abgeschlossen abgabgeschlossen 1212", value) def test_token_replacement_also_replace_dots_at_end_of_phrase(self) -> None: - with open(path.join(ABS_PATH_TEST_DATA, "token_replacement_testdata.csv"), "r", encoding="utf-8") as handler: + with open(path.join(ABS_PATH_TEST_DATA, "token_replacement_testdata.csv"), encoding="utf-8") as handler: pipeline = Pipeline({"token_replacement": handler.read()}) value = pipeline.consume("abg. 
1212") self.assertEqual("abgeschlossen 1212", value) diff --git a/tests/test_remove_signature.py b/tests/test_remove_signature.py index b048a58..9d91f95 100644 --- a/tests/test_remove_signature.py +++ b/tests/test_remove_signature.py @@ -4,136 +4,129 @@ from parameterized import parameterized from ai_data_preprocessing_queue.Pipeline import Pipeline -from ai_data_preprocessing_queue.Steps.remove_signature import ( - remove_greetings_and_following_text, remove_newline, step) +from ai_data_preprocessing_queue.Steps.remove_signature import remove_greetings_and_following_text, remove_newline, step class TestRemoveSignature(unittest.TestCase): - @parameterized.expand([ # type: ignore[untyped-decorator] - ( - "multiple_newlines", - "Could you please review the attached document?\n\n\nI need your feedback by Friday.", - "Could you please review the attached document? I need your feedback by Friday.", - ), - ( - "multiple_spaces", - "The meeting is scheduled for 3PM tomorrow.", - "The meeting is scheduled for 3PM tomorrow.", - ), - ( - "mixed_whitespace", - "Please find the report attached. \n\n The numbers look good \r\n\r\n for Q3!", - "Please find the report attached. The numbers look good for Q3!", - ), - ( - "empty_string", - "", - "" - ), - ( - "trailing_whitespace", - "I'll send the updated version tomorrow. \n\n ", - "I'll send the updated version tomorrow." - ) - ]) + @parameterized.expand( + [ # type: ignore[untyped-decorator] + ( + "multiple_newlines", + "Could you please review the attached document?\n\n\nI need your feedback by Friday.", + "Could you please review the attached document? I need your feedback by Friday.", + ), + ( + "multiple_spaces", + "The meeting is scheduled for 3PM tomorrow.", + "The meeting is scheduled for 3PM tomorrow.", + ), + ( + "mixed_whitespace", + "Please find the report attached. \n\n The numbers look good \r\n\r\n for Q3!", + "Please find the report attached. 
The numbers look good for Q3!", + ), + ("empty_string", "", ""), + ( + "trailing_whitespace", + "I'll send the updated version tomorrow. \n\n ", + "I'll send the updated version tomorrow.", + ), + ] + ) def test_remove_newline(self, name: str, input_text: str, expected: str) -> None: self.assertEqual(remove_newline(input_text), expected) - @parameterized.expand([ # type: ignore[untyped-decorator] - ( - "english_signature_basic", - "Here's the project update. Sincerely, John Smith\nProject Manager", - "Here's the project update." - ), - ( - "english_signature_with_content", - "Please review the attached documents. Best regards, Jane Doe\nSenior Developer\nTech Department", - "Please review the attached documents." - ), - ( - "english_signature_with_content_and_several_newlines", - "Please review the attached documents. Best regards,\nJane Doe\n\nSenior Developer\n\nTech Department", - "Please review the attached documents." - ), - ( - "german_signature", - "Die Unterlagen wurden aktualisiert. Mit freundlichen Grüßen, Hans Schmidt\nPhone: +49 123 456789", - "Die Unterlagen wurden aktualisiert." - ), - ( - "greeting_with_comma", - "Meeting is scheduled for tomorrow. Kind regards, Sarah", - "Meeting is scheduled for tomorrow." - ), - ( - "mixed_case_greeting", - "Report is ready. BEST REGARDS, Tom Wilson", - "Report is ready." - ), - ( - "multiple_greetings", - "Hello team, here's the update. Best regards, Jim\nRegards, HR Team", - "Hello team, here's the update." - ), - ( - "empty_string", - "", - "" - ), - ( - "no_greetings", - "This is a plain text without any greetings or signatures.", - "This is a plain text without any greetings or signatures." - ), - ]) + @parameterized.expand( + [ # type: ignore[untyped-decorator] + ( + "english_signature_basic", + "Here's the project update. Sincerely, John Smith\nProject Manager", + "Here's the project update.", + ), + ( + "english_signature_with_content", + "Please review the attached documents. 
Best regards, Jane Doe\nSenior Developer\nTech Department", + "Please review the attached documents.", + ), + ( + "english_signature_with_content_and_several_newlines", + "Please review the attached documents. Best regards,\nJane Doe\n\nSenior Developer\n\nTech Department", + "Please review the attached documents.", + ), + ( + "german_signature", + "Die Unterlagen wurden aktualisiert. Mit freundlichen Grüßen, Hans Schmidt\nPhone: +49 123 456789", + "Die Unterlagen wurden aktualisiert.", + ), + ( + "greeting_with_comma", + "Meeting is scheduled for tomorrow. Kind regards, Sarah", + "Meeting is scheduled for tomorrow.", + ), + ("mixed_case_greeting", "Report is ready. BEST REGARDS, Tom Wilson", "Report is ready."), + ( + "multiple_greetings", + "Hello team, here's the update. Best regards, Jim\nRegards, HR Team", + "Hello team, here's the update.", + ), + ("empty_string", "", ""), + ( + "no_greetings", + "This is a plain text without any greetings or signatures.", + "This is a plain text without any greetings or signatures.", + ), + ] + ) def test_remove_greetings_and_following_text(self, name: str, input_text: str, expected: str) -> None: self.assertEqual(remove_greetings_and_following_text(input_text), expected) - @parameterized.expand([ # type: ignore[untyped-decorator] - ( - "remove_signature_basic", - "We're sending the final draft for review. Best regards, Alice Johnson\nProject Lead", - "We're sending the final draft for review.", - ), - ( - "remove_signature_extended", - "Order Mice/keyboard\nGoodmorning, Can you please order the following: 10 x Dell Laser Mouse IL3220 " - "10 x Dell Business Keyboard AB322 (UK layout) Thx Best regards Jimmy B. 
" - "| C Facilities & Reception Klaus+Andreas Nederland | Anonymstraat 47 | 1234 AJ Amsterdam | Netherlands " - "Phone: +01 23 695 4567 | Mobile: +97 65 445 1234 | Fax: +31 35 695 8825 jim.anonymus@company.com " - "| www.nl.somecompany.com", - "Order Mice/keyboard Goodmorning, Can you please order the following: 10 x Dell Laser Mouse IL3220 " - "10 x Dell Business Keyboard AB322 (UK layout) Thx", - ), - ( - "thanking_at_start", - "Thank you very much for your support. " - "I will prepare the contract and send it tomorrow.\n\nBest regards, Bob Brown", - "I will prepare the contract and send it tomorrow.", - ), - ( - "thanking_in_middle", - "Thank you very much for your support. " - "I appreciate your support on this migration. Thanks a lot, I will share the logs shortly.", - "I appreciate your support on this migration. I will share the logs shortly.", - ), - ( - "single_greeting_word_german", - "The deliverables are ready. Grüße", - "The deliverables are ready.", - ), - ( - "german_empty_result", - "Vielen Dank für Ihre Hilfe. Mit freundlichen Grüßen, Lena Meyer " - "Und hier kommt noch mehr Text.", - "", - ), - ( - "no_change", - "Please schedule the kickoff meeting for next Tuesday morning at 10:00.", - "Please schedule the kickoff meeting for next Tuesday morning at 10:00.", - ), - ]) + @parameterized.expand( + [ # type: ignore[untyped-decorator] + ( + "remove_signature_basic", + "We're sending the final draft for review. Best regards, Alice Johnson\nProject Lead", + "We're sending the final draft for review.", + ), + ( + "remove_signature_extended", + "Order Mice/keyboard\nGoodmorning, Can you please order the following: 10 x Dell Laser Mouse IL3220 " + "10 x Dell Business Keyboard AB322 (UK layout) Thx Best regards Jimmy B. 
" + "| C Facilities & Reception Klaus+Andreas Nederland | Anonymstraat 47 | 1234 AJ Amsterdam | " + "Netherlands " + "Phone: +01 23 695 4567 | Mobile: +97 65 445 1234 | Fax: +31 35 695 8825 jim.anonymus@company.com " + "| www.nl.somecompany.com", + "Order Mice/keyboard Goodmorning, Can you please order the following: 10 x Dell Laser Mouse IL3220 " + "10 x Dell Business Keyboard AB322 (UK layout) Thx", + ), + ( + "thanking_at_start", + "Thank you very much for your support. " + "I will prepare the contract and send it tomorrow.\n\nBest regards, Bob Brown", + "I will prepare the contract and send it tomorrow.", + ), + ( + "thanking_in_middle", + "Thank you very much for your support. " + "I appreciate your support on this migration. Thanks a lot, I will share the logs shortly.", + "I appreciate your support on this migration. I will share the logs shortly.", + ), + ( + "single_greeting_word_german", + "The deliverables are ready. Grüße", + "The deliverables are ready.", + ), + ( + "german_empty_result", + "Vielen Dank für Ihre Hilfe. 
Mit freundlichen Grüßen, Lena Meyer Und hier kommt noch mehr Text.", + "", + ), + ( + "no_change", + "Please schedule the kickoff meeting for next Tuesday morning at 10:00.", + "Please schedule the kickoff meeting for next Tuesday morning at 10:00.", + ), + ] + ) def test_remove_signature(self, name: str, input_text: str, expected: str) -> None: pipeline = Pipeline({"remove_signature": None}) value = pipeline.consume(input_text) @@ -143,8 +136,10 @@ def test_remove_signature_step_empty_item(self) -> None: result = step("", {}, None, "") self.assertEqual(result, "") - @patch("ai_data_preprocessing_queue.Steps.remove_signature.remove_greetings_and_following_text", - side_effect=Exception("Test error")) + @patch( + "ai_data_preprocessing_queue.Steps.remove_signature.remove_greetings_and_following_text", + side_effect=Exception("Test error"), + ) def test_remove_signature_step_error(self, _: MagicMock) -> None: with self.assertRaises(Exception): step("Please schedule the kickoff meeting for next Tuesday morning at 10:00.", {}, None, "")