Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,4 @@ FROM mcr.microsoft.com/devcontainers/python:3.12-bookworm

# Uninstall pre-installed formatting and linting tools
# They would conflict with our pinned versions
RUN pipx uninstall flake8
RUN pipx uninstall mypy
12 changes: 2 additions & 10 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,15 @@
"vscode": {
"settings": {
"python.pythonPath": "/usr/local/bin/python",
"python.defaultInterpreterPath": "/usr/local/bin/python",
"python.languageServer": "Pylance",
"flake8.path": [
"/usr/local/py-utils/bin/bandit",
"/usr/local/py-utils/bin/pydocstyle"
],
"flake8.importStrategy": "fromEnvironment"
"python.defaultInterpreterPath": "/usr/local/bin/python"
},
"extensions": [
"AykutSarac.jsoncrack-vscode",
"charliermarsh.ruff",
"eamodio.gitlens",
"Gruntfuggly.todo-tree",
"matangover.mypy",
"ms-python.flake8",
"ms-python.isort",
"ms-python.mypy-type-checker",
"ms-python.pylint",
"ms-python.python",
"ms-python.vscode-pylance",
"njpwerner.autodocstring",
Expand Down
11 changes: 0 additions & 11 deletions .flake8

This file was deleted.

17 changes: 0 additions & 17 deletions .github/flake8-error-problem-matcher.json

This file was deleted.

17 changes: 0 additions & 17 deletions .github/flake8-warning-problem-matcher.json

This file was deleted.

80 changes: 80 additions & 0 deletions .github/workflows/01-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Continuous integration for ai-data-preprocessing-queue.
#
# Job "changes" decides whether the project was touched by the push (or whether
# this is the first run of the workflow on the branch). Job "build_python"
# consumes those outputs to gate dependency installation, linting, type
# checking and tests.
name: 01 | ai-data-preprocessing-queue | continuous integration
run-name: State ${{ github.ref }}
on:
  push:
    branches:
      - "**"
    paths-ignore:
      - README.md
      - LICENSE.txt
      - .gitignore
      - .vscode/**
      - .devcontainer/**

jobs:
  changes:
    name: changes
    runs-on: ubuntu-latest
    timeout-minutes: 1
    outputs:
      # The key must match what build_python reads via needs.changes.outputs.*
      ai_data_preprocessing_queue: ${{ steps.filter.outputs.ai_data_preprocessing_queue }}
      first_run: ${{ steps.first_run.outputs.first_run }}
    steps:
      - uses: actions/checkout@v4
      - uses: SamhammerAG/first-build-action@v2.1
        id: first_run
        with:
          workflow: 01 | ai-data-preprocessing-queue | continuous integration
          branch: ${{ github.ref_name }}
      - uses: dorny/paths-filter@v3
        id: filter
        with:
          base: ${{ github.ref }}
          # paths-filter exposes one output per filter, named after the filter key,
          # so the key has to be spelled exactly like the output referenced above.
          filters: |
            ai_data_preprocessing_queue:
              - '.github/workflows/01-ci.yml'
              - 'ai-data-preprocessing-queue/**'
              - 'ai_data_preprocessing_queue/**'

  build_python:
    name: Python | build
    # Without this dependency the needs.changes.outputs.* expressions below
    # evaluate to empty strings and every gated step is silently skipped.
    needs: changes
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          # Quoted so YAML does not parse the version as a float.
          python-version: "3.12"
      - name: preparation | Add linting matchers
        run: |
          echo "::add-matcher::.github/mypy-error-problem-matcher.json"
      - name: preparation | upgrade pip
        if: needs.changes.outputs.ai_data_preprocessing_queue == 'true' || needs.changes.outputs.first_run == 'true'
        run: python -m pip install --upgrade pip
      - name: ai-data-preprocessing-queue | Install dependencies
        if: needs.changes.outputs.ai_data_preprocessing_queue == 'true' || needs.changes.outputs.first_run == 'true'
        run: pip install -r requirements-dev.txt
      - name: preparation | Create failures dir
        run: mkdir -p ./ruff_failures
      - name: ai-data-preprocessing-queue | ruff check
        if: needs.changes.outputs.ai_data_preprocessing_queue == 'true' || needs.changes.outputs.first_run == 'true'
        # A failing ruff run is recorded instead of aborting the job so that the
        # remaining checks still execute; the last step turns it into a failure.
        run: |
          ruff check --output-format github . || echo "ai_data_preprocessing_queue" >> ./ruff_failures/ruff_failures.txt
      - name: ai-data-preprocessing-queue | Lint typing
        if: needs.changes.outputs.ai_data_preprocessing_queue == 'true' || needs.changes.outputs.first_run == 'true'
        # Must run with installed dependencies
        # Must be executed from root with path to project as param because
        # the problem matcher needs an absolute path in error message to work correctly
        # NOTE(review): the target below uses hyphens; confirm it matches the
        # package directory name in the repository (possibly ai_data_preprocessing_queue).
        working-directory: ./
        run: |
          mypy --config-file .mypy.ini --show-column-numbers ai-data-preprocessing-queue
      - name: Test ai-data-preprocessing-queue
        if: needs.changes.outputs.ai_data_preprocessing_queue == 'true' || needs.changes.outputs.first_run == 'true'
        # NOTE(review): confirm this working directory and the "app" coverage
        # target exist in this repository; requirements-dev.txt is read from the root.
        working-directory: ai-data-preprocessing-queue
        run: pytest --cov app --cov-report=xml tests
      - name: ai-data-preprocessing-queue | Report coverage
        if: needs.changes.outputs.ai_data_preprocessing_queue == 'true' || needs.changes.outputs.first_run == 'true'
        uses: MishaKav/pytest-coverage-comment@main
        with:
          title: Test coverage for changes on ai-data-preprocessing-queue
          pytest-xml-coverage-path: ai-data-preprocessing-queue/coverage.xml
          report-only-changed-files: true
      - name: ai-data-preprocessing-queue | Fail on ruff findings
        # Runs even when an earlier step failed so lint findings are always reported.
        if: always()
        run: |
          if [ -s ./ruff_failures/ruff_failures.txt ]; then
            echo "::error::ruff check reported findings for:"
            cat ./ruff_failures/ruff_failures.txt
            exit 1
          fi
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ jobs:
name: Build and/or publish Python 🐍 distributions 📦 to PyPI
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: "3.12"
- run: >-
Expand Down
47 changes: 0 additions & 47 deletions .github/workflows/ci.yml

This file was deleted.

5 changes: 0 additions & 5 deletions .pylintrc

This file was deleted.

6 changes: 2 additions & 4 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,11 @@
"files.trimTrailingWhitespace": true,
"files.exclude": {
".coverage": true,
".flake8": true,
".gitattributes": true,
".gitignore": true,
".idea": true,
".mypy_cache": true,
".mypy.ini": true,
".pylintrc": true,
"*.egg-info": true,
"**/__pycache__": true,
"**/.pytest_cache": true,
Expand All @@ -29,8 +27,8 @@
"python.testing.pytestEnabled": true,
"python.testing.unittestEnabled": false,
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnSave": true
"editor.formatOnSave": true,
"editor.defaultFormatter": "charliermarsh.ruff"
},
"python.analysis.diagnosticSeverityOverrides": {
"reportPrivateUsage": "information",
Expand Down
12 changes: 11 additions & 1 deletion .vscode/tasks.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,17 @@
{
"label": "Lint",
"type": "shell",
"command": "flake8 . --count --statistics; mypy .",
"command": "ruff check; mypy .",
"group": "test",
"presentation": {
"reveal": "always",
"panel": "new"
}
},
{
"label": "Format all",
"type": "shell",
"command": "ruff format",
"group": "test",
"presentation": {
"reveal": "always",
Expand Down
4 changes: 2 additions & 2 deletions ai_data_preprocessing_queue/Steps/language_detect.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
"""
Detects one of the following languages and writes the language to local state.
"""Detects one of the following languages and writes the language to local state.

af, ar, bg, bn, ca, cs, cy, da, de, el, en, es, et, fa, fi, fr, gu, he,
hi, hr, hu, id, it, ja, kn, ko, lt, lv, mk, ml, mr, ne, nl, no, pa, pl,
pt, ro, ru, sk, sl, so, sq, sv, sw, ta, te, th, tl, tr, uk, ur, vi,
zh-cn, zh-tw
"""

from typing import Any

from langdetect import detect
Expand Down
3 changes: 1 addition & 2 deletions ai_data_preprocessing_queue/Steps/remove_numbers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,4 @@


def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | None, preprocessor_data: str) -> Any:
item = re.sub(r"""\d""", " ", item)
return item
return re.sub(r"""\d""", " ", item)
3 changes: 1 addition & 2 deletions ai_data_preprocessing_queue/Steps/remove_punctuation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,4 @@


def step(item: Any, item_state: dict[str, Any], global_state: dict[str, Any] | None, preprocessor_data: str) -> Any:
item = re.sub(r"[^\w\s]", " ", item)
return item
return re.sub(r"[^\w\s]", " ", item)
57 changes: 35 additions & 22 deletions ai_data_preprocessing_queue/Steps/remove_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,30 @@ def remove_newline(text: str) -> str:
"""Remove excessive newlines or spaces from the text."""
pattern = re.compile(r"\s{2,}|[\n\r]{3,}")
result = pattern.sub(" ", text)
result = re.sub(r"\s+", " ", result).strip()

return result


GreetingExpressions = ["sincerely", "best regards", "happy holidays", "kind regards", "warm regards", "cheers",
"regards", "mit freundlichen grüßen", "freundliche grüße", "beste grüße", "viele grüße",
"herzliche grüße", "liebe grüße", "mit freundlichen grüssen", "freundliche grüsse",
"beste grüsse", "viele grüsse", "herzliche grüsse", "liebe grüsse"]
return re.sub(r"\s+", " ", result).strip()


GreetingExpressions = [
"sincerely",
"best regards",
"happy holidays",
"kind regards",
"warm regards",
"cheers",
"regards",
"mit freundlichen grüßen",
"freundliche grüße",
"beste grüße",
"viele grüße",
"herzliche grüße",
"liebe grüße",
"mit freundlichen grüssen",
"freundliche grüsse",
"beste grüsse",
"viele grüsse",
"herzliche grüsse",
"liebe grüsse",
]
greetings_regex = r"(" + "|".join(GreetingExpressions) + r")\s*,?\s*"


Expand All @@ -26,15 +41,15 @@ def remove_greetings_and_following_text(text: str) -> str:
# thank you expressions should be removed after greetings and following signature text,
# as they often appear at the beginning of a message
THANK_EXPRESSIONS = [
r"thank you(?: very much)?", # thank you, thank you very much
r"thankyou(?: very much)?", # thankyou, thankyou very much
r"thanks(?: a lot| again)?", # thanks, thanks a lot, thanks again
r"many thanks", # many thanks
r"a thousand thanks", # a thousand thanks
r"danke(?: schön)?", # danke, danke schön
r"vielen dank", # vielen dank
r"dankeschön", # dankeschön
r"besten dank" # besten dank
r"thank you(?: very much)?", # thank you, thank you very much
r"thankyou(?: very much)?", # thankyou, thankyou very much
r"thanks(?: a lot| again)?", # thanks, thanks a lot, thanks again
r"many thanks", # many thanks
r"a thousand thanks", # a thousand thanks
r"danke(?: schön)?", # danke, danke schön
r"vielen dank", # vielen dank
r"dankeschön", # dankeschön
r"besten dank", # besten dank
]

# Suffixes which could follow thank you expressions
Expand All @@ -43,15 +58,13 @@ def remove_greetings_and_following_text(text: str) -> str:
r"(?:for (?:your|the) (?:help|support|understanding|assistance))",
r"(?:schon mal\s+)?(?:im voraus\s+)?für\s+(?:ihre|ihr|eure|die|den)\s+(?:hilfe|support|verständnis)",
r"vorab",
r"kindly?"
r"kindly?",
]

# Combine them into a final regex pattern and compile
thank_expressions = r"|".join(THANK_EXPRESSIONS)
suffixes = r"(?:\s+(?:" + r"|".join(THANK_SUFFIXES) + r"))?"
final_pattern = (
r"\b(?:" + thank_expressions + r")" + suffixes + r"\s*(?:,|\.|!|;)?\s*"
)
final_pattern = r"\b(?:" + thank_expressions + r")" + suffixes + r"\s*(?:,|\.|!|;)?\s*"
thanking_regex = re.compile(final_pattern, flags=re.IGNORECASE | re.UNICODE)


Expand Down
Loading
Loading