4945 lines
229 KiB
Python
4945 lines
229 KiB
Python
import os
|
||
import logging
|
||
import hashlib
|
||
import json
|
||
import re
|
||
import time
|
||
import base64
|
||
from typing import Dict, List, Optional, Any, Union, Tuple
|
||
from datetime import datetime
|
||
from openai import OpenAI
|
||
from dotenv import load_dotenv
|
||
from document_processor import DocumentProcessor
|
||
from shipping_calculator import calculate_shipping_cost
|
||
|
||
try:
    from container_reference import iso_reference_prompt_block
except ImportError:
    # container_reference is optional: when it cannot be imported, the
    # prompt simply gets no ISO reference section.
    def iso_reference_prompt_block() -> str:
        """Return an empty ISO reference block (fallback stub)."""
        return str()
|
||
|
||
# Load variables from a .env file (python-dotenv) before any RAG_* setting
# is read through os.getenv().
load_dotenv()

logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Directory that contains this module; default location for the files and
# folders whose paths are built from it below.
_SHIPPING_DIR = os.path.dirname(__file__)
|
||
|
||
|
||
def _normalize_dimension_unit_field(raw: Any) -> Optional[str]:
|
||
"""Значение поля unit / dimension_unit из JSON: cm | mm | m или None."""
|
||
if raw is None:
|
||
return None
|
||
s = str(raw).strip().lower()
|
||
if s in ("mm", "мм", "millimeter", "millimeters", "миллиметр", "миллиметры"):
|
||
return "mm"
|
||
if s in ("cm", "см", "centimeter", "centimeters", "сантиметр", "сантиметры"):
|
||
return "cm"
|
||
if s in ("m", "meter", "metre", "metres", "meters", "метр", "метры", "м"):
|
||
return "m"
|
||
return None
|
||
|
||
|
||
def _line_hint_dimension_unit(line: str) -> Optional[str]:
|
||
"""По строке письма/таблицы: явные см/мм рядом с габаритами."""
|
||
if not isinstance(line, str) or not line.strip():
|
||
return None
|
||
ll = line.lower()
|
||
has_mm = bool(re.search(r"(?<![a-zа-яё])(мм|mm)(?![a-zа-яё])", ll))
|
||
has_cm = bool(re.search(r"(?<![a-zа-яё])(см|cm)(?![a-zа-яё])", ll))
|
||
if has_mm and not has_cm:
|
||
return "mm"
|
||
if has_cm and not has_mm:
|
||
return "cm"
|
||
if has_mm and has_cm:
|
||
# оба упомянуты — чаще в переписке дают см; не делим на 10 без других признаков
|
||
return "cm"
|
||
return None
|
||
|
||
|
||
def infer_cargo_triple_raw_to_cm(
    a: float,
    b: float,
    c: float,
    *,
    explicit_unit: Optional[str] = None,
) -> Tuple[float, float, float]:
    """Convert a raw dimension triple (no unit suffix in the JSON) to centimetres.

    An explicit unit always wins: ``"mm"`` divides by 10, ``"m"`` multiplies
    by 100 and ``"cm"`` returns the numbers untouched. Without one, a
    cautious heuristic decides whether the numbers look like millimetres.
    A plain "max > 120 means mm" rule is deliberately not used because it
    would break ordinary centimetre triples such as 150x80x60.

    Returns:
        The three dimensions in centimetres, in the order given.
    """
    dims = (a, b, c)

    if explicit_unit == "m":
        return (a * 100.0, b * 100.0, c * 100.0)
    if explicit_unit == "cm":
        return dims

    if explicit_unit != "mm":
        largest, smallest = max(dims), min(dims)
        # Typical box given in mm as bare numbers: 1200x800x600.
        box_in_mm = largest >= 1000.0
        # Flat pieces given in mm: 1200x800x40 and the like.
        flat_in_mm = largest >= 600.0 and smallest <= 100.0
        if not (box_in_mm or flat_in_mm):
            # Otherwise assume centimetres (e.g. 150x80x60, 400x300x200).
            return dims

    return (a / 10.0, b / 10.0, c / 10.0)
|
||
|
||
|
||
def _max_llm_context_chars() -> int:
|
||
"""
|
||
Максимальная длина строки контекста (письма + полный текст вложений) для запроса к LLM.
|
||
Переменная окружения RAG_MAX_CONTEXT_CHARS:
|
||
- не задана → 0 (без обрезки; при необходимости ограничения задайте положительное число);
|
||
- 0 → без обрезки (осторожно: очень большие письма/вложения увеличат запрос к API).
|
||
"""
|
||
raw = os.getenv("RAG_MAX_CONTEXT_CHARS", "0").strip()
|
||
if raw == "0":
|
||
return 0
|
||
try:
|
||
n = int(raw)
|
||
return n if n > 0 else 400_000
|
||
except ValueError:
|
||
return 400_000
|
||
|
||
|
||
def _max_cargo_segment_chars() -> int:
|
||
"""
|
||
Лимит длины сегмента cargo_description при разбиении по кодам позиций.
|
||
RAG_MAX_SEGMENT_CHARS: не задана → 2_000_000; 0 → без обрезки.
|
||
"""
|
||
raw = os.getenv("RAG_MAX_SEGMENT_CHARS", "2000000").strip()
|
||
if raw == "0":
|
||
return 0
|
||
try:
|
||
n = int(raw)
|
||
return n if n > 0 else 2_000_000
|
||
except ValueError:
|
||
return 2_000_000
|
||
|
||
|
||
def _criteria_preview_chars() -> int:
|
||
"""
|
||
Лимит символов поля criteria каждого типа перевозки в JSON промпта (не письма).
|
||
RAG_CRITERIA_PREVIEW_CHARS: по умолчанию 8000; 0 — без обрезки, целиком как в shipping_types.
|
||
"""
|
||
raw = os.getenv("RAG_CRITERIA_PREVIEW_CHARS", "8000").strip()
|
||
try:
|
||
return int(raw)
|
||
except ValueError:
|
||
return 8000
|
||
|
||
|
||
def _mandatory_counterparty_chars() -> int:
|
||
"""
|
||
То же для mandatory_counterparty_criteria в промпте. По умолчанию 4000; 0 — без обрезки.
|
||
"""
|
||
raw = os.getenv("RAG_MANDATORY_COUNTERPARTY_CHARS", "4000").strip()
|
||
try:
|
||
return int(raw)
|
||
except ValueError:
|
||
return 4000
|
||
|
||
|
||
def _limit_prompt_field(text: str, max_chars: int) -> str:
|
||
"""Обрезка только для вставки длинных полей справочника в промпт; max_chars <= 0 — полный текст."""
|
||
s = text if isinstance(text, str) else ""
|
||
if max_chars <= 0:
|
||
return s
|
||
return s[:max_chars]
|
||
|
||
|
||
def _json_prompt_compact(obj: Any) -> str:
|
||
"""JSON для LLM без отступов и лишних пробелов — меньше токенов, чем indent=2."""
|
||
return json.dumps(obj, ensure_ascii=False, separators=(",", ":"))
|
||
|
||
|
||
def _email_thread_dedupe_enabled() -> bool:
|
||
"""RAG_EMAIL_THREAD_DEDUPE=0|false — не удалять повторяющиеся абзацы в теле для промпта."""
|
||
raw = os.getenv("RAG_EMAIL_THREAD_DEDUPE", "1").strip().lower()
|
||
if raw in ("0", "false", "no", "off", "disable", "disabled"):
|
||
return False
|
||
return True
|
||
|
||
|
||
def _dedupe_email_thread_paragraphs(text: str, min_para_chars: int = 48) -> str:
|
||
"""
|
||
Убирает дословно повторяющиеся абзацы (типичные дисклеймеры «confidential…» в цепочке FW/RE).
|
||
Не режет текст по длине — только дубликаты блоков.
|
||
"""
|
||
if not isinstance(text, str) or not text.strip():
|
||
return text
|
||
parts = re.split(r"(\n\s*\n)", text)
|
||
out: List[str] = []
|
||
seen: set[str] = set()
|
||
for i, part in enumerate(parts):
|
||
if i % 2 == 1:
|
||
out.append(part)
|
||
continue
|
||
key = part.strip()
|
||
if len(key) < min_para_chars:
|
||
out.append(part)
|
||
continue
|
||
if key in seen:
|
||
continue
|
||
seen.add(key)
|
||
out.append(part)
|
||
s = "".join(out)
|
||
s = re.sub(r"\n{5,}", "\n\n\n\n", s)
|
||
return s
|
||
|
||
|
||
def _shipping_types_prompt_k() -> int:
|
||
"""
|
||
Сколько типов перевозок подставлять в основной промпт (по скорингу keywords по тексту писем).
|
||
0 — все типы (как раньше). 8–15 обычно сильно режет токены без обрезки писем/вложений.
|
||
RAG_SHIPPING_TYPES_PROMPT_K
|
||
"""
|
||
raw = os.getenv("RAG_SHIPPING_TYPES_PROMPT_K", "0").strip()
|
||
try:
|
||
return int(raw)
|
||
except ValueError:
|
||
return 0
|
||
|
||
|
||
def _report_cache_enabled() -> bool:
|
||
"""RAG_REPORT_CACHE=0|false отключает дисковый кэш результатов query_cargo_info."""
|
||
raw = os.getenv("RAG_REPORT_CACHE", "1").strip().lower()
|
||
if raw in ("0", "false", "no", "off", "disable", "disabled"):
|
||
return False
|
||
return True
|
||
|
||
|
||
def _report_cache_dir() -> str:
|
||
base = os.getenv("RAG_CACHE_DIR", "").strip()
|
||
if not base:
|
||
base = os.path.join(_SHIPPING_DIR, ".rag_cache")
|
||
return os.path.join(base, "cargo_reports")
|
||
|
||
|
||
def _report_cache_bust_token() -> str:
|
||
"""Произвольная строка для инвалидации кэша после смены логики промпта (RAG_CACHE_BUST)."""
|
||
return (os.getenv("RAG_CACHE_BUST") or "").strip()
|
||
|
||
|
||
def _learning_store_path() -> str:
|
||
raw = (os.getenv("RAG_LEARNING_STORE") or "").strip()
|
||
if raw:
|
||
return raw
|
||
return os.path.join(_SHIPPING_DIR, "cargo_rag_learning.json")
|
||
|
||
|
||
def _learning_auto_enabled() -> bool:
|
||
"""RAG_LEARNING_AUTO=0 отключает автосохранение пар «контекст → structured_data» после анализа."""
|
||
raw = (os.getenv("RAG_LEARNING_AUTO") or "1").strip().lower()
|
||
return raw not in ("0", "false", "no", "off", "disable", "disabled")
|
||
|
||
|
||
def _learning_few_shot_enabled() -> bool:
|
||
"""RAG_LEARNING_FEW_SHOT=0 не подмешивает примеры в промпт."""
|
||
raw = (os.getenv("RAG_LEARNING_FEW_SHOT") or "1").strip().lower()
|
||
return raw not in ("0", "false", "no", "off", "disable", "disabled")
|
||
|
||
|
||
def _learning_max_store() -> int:
|
||
try:
|
||
n = int((os.getenv("RAG_LEARNING_MAX") or "50").strip())
|
||
return max(1, min(n, 500))
|
||
except ValueError:
|
||
return 50
|
||
|
||
|
||
def _learning_few_shot_count() -> int:
|
||
try:
|
||
n = int((os.getenv("RAG_LEARNING_FEW_SHOT_N") or "2").strip())
|
||
return max(0, min(n, 5))
|
||
except ValueError:
|
||
return 2
|
||
|
||
|
||
def _context_word_set(text: str, max_words: int = 400) -> set[str]:
|
||
if not isinstance(text, str) or not text.strip():
|
||
return set()
|
||
words = re.findall(r"[a-zA-Zа-яА-ЯёЁ0-9]{3,}", text.lower())
|
||
stop = {
|
||
"the", "and", "for", "not", "this", "that", "with", "from",
|
||
"для", "это", "все", "как", "что", "при", "или", "также", "если",
|
||
"когда", "будет", "есть", "были", "было", "этот", "этой",
|
||
}
|
||
out: set[str] = set()
|
||
for w in words:
|
||
if w in stop:
|
||
continue
|
||
out.add(w)
|
||
if len(out) >= max_words:
|
||
break
|
||
return out
|
||
|
||
|
||
def _truncate_structured_for_learning(sd: Any, *, max_shipments: int = 2, max_str: int = 400) -> Dict:
|
||
"""Укорачивает JSON для хранения и few-shot, чтобы не раздувать промпт."""
|
||
if not isinstance(sd, dict):
|
||
return {}
|
||
|
||
def _trim(val: Any, depth: int = 0) -> Any:
|
||
if depth > 6:
|
||
return None
|
||
if isinstance(val, str):
|
||
s = val.strip()
|
||
return s[:max_str] + ("…" if len(s) > max_str else "")
|
||
if isinstance(val, (int, float, bool)) or val is None:
|
||
return val
|
||
if isinstance(val, list):
|
||
out_l: List[Any] = []
|
||
cap = 12 if depth == 0 else 8
|
||
for i, x in enumerate(val):
|
||
if i >= cap:
|
||
break
|
||
out_l.append(_trim(x, depth + 1))
|
||
return out_l
|
||
if isinstance(val, dict):
|
||
out_d: Dict[str, Any] = {}
|
||
for i, (k, v) in enumerate(val.items()):
|
||
if i >= 40:
|
||
break
|
||
if isinstance(k, str):
|
||
out_d[k.strip()] = _trim(v, depth + 1)
|
||
return out_d
|
||
return str(val)[:max_str]
|
||
|
||
shipments = sd.get("shipments")
|
||
if isinstance(shipments, list) and len(shipments) > max_shipments:
|
||
sd = {**sd, "shipments": shipments[:max_shipments]}
|
||
return _trim(sd, 0) or {}
|
||
|
||
|
||
# Candidate locations of the shipping-types file, in lookup order.
SHIPPING_TYPES_CANDIDATE_FILES = [
    os.path.join(_SHIPPING_DIR, filename)
    for filename in ("shipping_types.json", "shipping_types(1).json")
]
# Primary shipping-types file (first candidate).
SHIPPING_TYPES_FILE = SHIPPING_TYPES_CANDIDATE_FILES[0]
# File with the processed criteria, keyed by shipping type name.
PROCESSED_SHIPPING_TYPES_FILE = os.path.join(_SHIPPING_DIR, "shipping_types_processed.json")
|
||
|
||
|
||
def resolve_shipping_types_path() -> Optional[str]:
    """Return the first existing path from ``SHIPPING_TYPES_CANDIDATE_FILES``.

    Returns:
        The path, or ``None`` when none of the candidates exists.
    """
    existing = (
        candidate
        for candidate in SHIPPING_TYPES_CANDIDATE_FILES
        if os.path.exists(candidate)
    )
    return next(existing, None)
|
||
|
||
|
||
def _fallback_process_shipping_type_criteria_static(criteria_text: str) -> str:
|
||
"""
|
||
Быстрый детерминированный fallback для criteria_ai.
|
||
Это нужно, чтобы отчёты сразу перестали показывать "сырой" длинный список,
|
||
даже если файл shipping_types_processed.json ещё не заполнен.
|
||
"""
|
||
if not isinstance(criteria_text, str) or not criteria_text.strip():
|
||
return ""
|
||
text = criteria_text.lower()
|
||
|
||
label_rules = [
|
||
("название клиента" in text, "Клиент"),
|
||
(("код тн вэд" in text) or ("код тн" in text), "Код ТН ВЭД"),
|
||
(("инкотерм" in text), "Условия поставки Incoterms"),
|
||
(("адрес забора" in text) or ("порт отправления" in text) or ("порт погрузки" in text), "Адрес забора груза"),
|
||
(("адрес доставки" in text) or ("порт назначения" in text) or ("порт выгрузки" in text), "Адрес доставки"),
|
||
(("стоимость груза" in text), "Стоимость груза"),
|
||
(("количество грузовых мест" in text) or ("количество мест" in text), "Количество грузовых мест"),
|
||
(("общий вес" in text), "Вес груза"),
|
||
(
|
||
("габарит" in text)
|
||
and (
|
||
"транспортн" in text
|
||
or "машин" in text
|
||
or "средств" in text
|
||
or "кузов" in text
|
||
or "прицеп" in text
|
||
or "полуприцеп" in text
|
||
),
|
||
"Габариты машины",
|
||
),
|
||
(("габарит" in text), "Габариты"),
|
||
(("общий объ" in text) or ("общий объем" in text) or ("объём" in text), "Объём"),
|
||
(("характер груза" in text) or ("наименование груза" in text), "Характер груза"),
|
||
(("опасные вещества" in text) or ("опасные свойства" in text), "Опасные свойства"),
|
||
(("msds" in text) or ("паспорт безопасности" in text), "Требуется MSDS"),
|
||
(("dgm" in text) or ("dangerous goods management" in text), "Требуется DGM"),
|
||
(("авторизац" in text), "Разрешение бренда"),
|
||
(("название бренда" in text) or ("необходимо указать название бренда" in text), "Бренд"),
|
||
(("экспортная лиценз" in text) or ("экспортной лиценз" in text) or ("экспорт" in text and "лиценз" in text), "Лицензия экспортёра"),
|
||
(("температурный режим" in text) or ("температур" in text), "Температурный режим"),
|
||
]
|
||
|
||
labels: List[str] = [label for ok, label in label_rules if ok]
|
||
deduped = list(dict.fromkeys(labels))
|
||
return "; ".join(deduped) if deduped else ""
|
||
|
||
#=============================================================================
|
||
#Словарь соответствия технических ключей и понятных названий
|
||
#=============================================================================
|
||
# NOTE(review): the original literal listed "pickup_address" twice and
# "delivery_address" three times. Python keeps the LAST value of a repeated
# key, so the labels "Адрес забора груза" and "Адрес доставки" never reached
# the dictionary. The shadowed entries are removed here; the effective
# content and key order are unchanged. Confirm which label was intended.
FIELD_LABELS = {
    "client_name": "Клиент",
    "incoterms": "Условия поставки Incoterms",
    "cargo_ready_date": "Дата готовности груза",
    "pickup_address": "Порт погрузки",
    "delivery_address": "Порт выгрузки",

    # Synonyms
    "pickup_address_alt1": "Адрес забора",
    "pickup_address_alt2": "Адрес отправки",
    "pickup_address_alt3": "Место забора",

    "delivery_address_alt1": "Адрес получения",
    "delivery_address_alt2": "Адрес назначения",
    "cargo_value": "Стоимость груза",
    "package_count": "Количество грузовых мест",
    "total_weight_kg": "Вес груза",
    "dimensions": "Габариты",
    "total_volume_cbm": "Объём",
    "cargo_description": "Характер груза",
    "hs_code": "Код ТН ВЭД",
    "dangerous_goods": "Опасные свойства",
    "msds_required": "Требуется MSDS",
    "estimated_cost": "Стоимость перевозки",
    "estimated_transit_time": "Срок доставки",
    "dimensions_str": "Габариты",
    "dangerous_goods_str": "Опасные свойства",
    "missing_fields": "Необходимая информация",
    "shipment_type": "Тип перевозки",
    "container_type": "Тип контейнера",
    "vehicle_type": "Тип транспорта",
    "temperature_range": "Температурный режим",
    "vehicle_dimensions": "Габариты машины",
    "vehicle_dimensions_str": "Габариты машины",
    "stackable_with_others": "Штабелирование с другими отправками",
    "customs_clearance_place_export_rf": "Место таможенного оформления (экспорт РФ)",
    "dangerous_goods_clarification": "Батарейки, газы, жидкости, аэрозоли в грузе (уточнение по категориям)",
}

#=============================================================================
# Reverse dictionary: human-readable label -> technical key
#=============================================================================
# Several keys share one label ("Габариты", "Опасные свойства",
# "Габариты машины"); the key listed last wins, i.e. the *_str variant.
LABEL_TO_FIELD = {v: k for k, v in FIELD_LABELS.items()}
|
||
|
||
|
||
def _dangerous_goods_unspecified(shipment: Dict) -> bool:
|
||
dg = shipment.get("dangerous_goods")
|
||
dg_keys = ("batteries", "gases", "liquids", "dry_ice")
|
||
if isinstance(dg, dict):
|
||
for k in dg_keys:
|
||
if dg.get(k) is True:
|
||
return False
|
||
if all(dg.get(k) is False for k in dg_keys):
|
||
return False
|
||
if any(dg.get(k) is False for k in dg_keys):
|
||
return False
|
||
note = shipment.get("dangerous_goods_note")
|
||
if isinstance(note, str) and note.strip():
|
||
return False
|
||
return True
|
||
|
||
|
||
def collect_extra_required_missing(shipment: Dict, st_def: Optional[Dict]) -> List[str]:
    """List the ``extra_required_fields`` that the shipment does not provide.

    These fields are meant to be requested from the client. Only the
    field names handled below are checked; any other name is ignored.

    Args:
        shipment: Extracted shipment data.
        st_def: Shipping type definition; its ``extra_required_fields``
            list drives the check.

    Returns:
        Missing field names in the order they are listed (duplicates in the
        list are reported as many times as they occur).
    """
    if not isinstance(st_def, dict):
        return []
    requested = st_def.get("extra_required_fields")
    if not isinstance(requested, list):
        return []

    # Fields that are missing when absent or a blank string.
    text_fields = (
        "hs_code",
        "vehicle_type",
        "container_type",
        "customs_clearance_place_export_rf",
    )

    missing: List[str] = []
    for entry in requested:
        if not isinstance(entry, str):
            continue
        name = entry.strip()
        if not name:
            continue

        if name == "dangerous_goods_clarification":
            absent = _dangerous_goods_unspecified(shipment)
        elif name == "stackable_with_others":
            absent = shipment.get(name) is None
        elif name in text_fields:
            value = shipment.get(name)
            absent = value is None or (isinstance(value, str) and not value.strip())
        elif name == "total_volume_cbm":
            value = shipment.get(name)
            if value is None or value == "":
                absent = True
            else:
                # A non-positive number is as good as no volume at all.
                absent = isinstance(value, (int, float)) and float(value) <= 0
        else:
            absent = False

        if absent:
            missing.append(name)
    return missing
|
||
|
||
|
||
def infer_container_load_mode_from_text(text_lower: str) -> Optional[str]:
    """Estimate FCL vs LCL from free text (container sea/rail shipments).

    Keyword evidence is checked first. With no keyword evidence at all, a
    container count such as "2x40HC" counts as FCL.

    Two false positives are guarded against:

    * "неполный контейнер" is an LCL phrase; the FCL alternative
      "полн* контейнер" must not match inside it (otherwise the text looks
      contradictory and the result is ``None``);
    * cargo dimensions ("120x80x40 см", "80x40 cm") are not container
      counts and must not produce "FCL".

    Args:
        text_lower: Text to inspect (lower-cased again internally).

    Returns:
        ``"LCL"``, ``"FCL"`` or ``None`` when the evidence is contradictory
        or absent.
    """
    if not text_lower or not isinstance(text_lower, str):
        return None
    t = text_lower.lower()

    lcl = bool(
        re.search(
            r"\blcl\b|сборн\w*(\s+груз|\s+контейнер|\s+перевоз|\s+отправк)|"
            r"группаж|консолидац\w*|consolidat\w*|groupage|"
            r"less\s*than\s*container|неполн\w*\s+контейнер|"
            r"дол\w*\s+в\s+контейнер|ко[-\s]?лоад|cfs\b|lcl\s*cargo",
            t,
            re.IGNORECASE,
        )
    )
    fcl = bool(
        re.search(
            r"\bfcl\b|full\s*container\s*load|"
            # (?<!не) keeps "неполн* контейнер" (LCL) out of the FCL match.
            r"цельн\w*\s+контейнер|(?<!не)полн\w*\s+контейнер|"
            r"отдельн\w*\s+контейнер|exclusive\s*use|"
            r"soc\s*container|shipper['\u2019]s?\s*own\s*container",
            t,
            re.IGNORECASE,
        )
    )

    if lcl and fcl:
        return None
    if lcl:
        return "LCL"
    if fcl:
        return "FCL"

    # Blank out chains of three or more numbers joined by "x"
    # (length x width x height) before looking for container counts.
    without_dimensions = re.sub(
        r"\d+(?:[.,]\d+)?(?:\s*[x×х*]\s*\d+(?:[.,]\d+)?){2,}",
        " ",
        t,
    )
    if re.search(
        r"\d{1,3}\s*[x×х*]\s*(20|40|45)\s*['′'`´]?\s*"
        r"(dc|hc|hq|gp|dv|dry|rf|rh|reefer|ot|fr|tk|tank)?\b"
        # A cm/mm unit right after the pair marks a two-number dimension.
        r"(?!\s*(?:см|мм|cm|mm)\b)",
        without_dimensions,
        re.IGNORECASE,
    ):
        return "FCL"
    return None
|
||
|
||
|
||
def _shipping_type_name_implies_load_mode(name: str) -> Optional[str]:
|
||
if not name:
|
||
return None
|
||
n = name.lower()
|
||
if "(lcl)" in n:
|
||
return "LCL"
|
||
if "(fcl)" in n:
|
||
return "FCL"
|
||
return None
|
||
|
||
|
||
def _is_parallel_multi_cargo_order_table(text: str) -> bool:
|
||
"""
|
||
Несколько самостоятельных партий в одной таблице (столбцы с разными номерами груза/заказа).
|
||
Отличается от тендера: разные инвойсы, разные Total, разные требования по DG — это НЕ один shipment.
|
||
"""
|
||
if not isinstance(text, str) or len(text.strip()) < 80:
|
||
return False
|
||
tl = text.lower()
|
||
if not re.search(r"номер\s+груз", tl):
|
||
return False
|
||
troc = re.findall(r"\btrocyps[-_]?\d+\b", text, flags=re.IGNORECASE)
|
||
uniq_troc = {x.lower().replace("_", "-") for x in troc}
|
||
if len(uniq_troc) >= 2:
|
||
if tl.count("total:") >= 2 or len(re.findall(r"\$\s*[\d\s]{4,}", text)) >= 2:
|
||
return True
|
||
return False
|
||
|
||
|
||
def _strong_invoice_value_mismatch(a: str, b: str) -> bool:
|
||
"""Два ненулевых инвойса в $ с отношением >= 2 — разные партии, не склеивать агрессивно."""
|
||
|
||
def _grab_usd(s: str) -> Optional[float]:
|
||
if not isinstance(s, str):
|
||
return None
|
||
m = re.search(r"\$\s*([\d\s.,]+)", s)
|
||
if not m:
|
||
return None
|
||
raw = m.group(1).replace(" ", "").replace("\xa0", "")
|
||
if raw.count(",") == 1 and raw.count(".") == 0 and len(raw.split(",")[-1]) == 2:
|
||
raw = raw.replace(",", ".")
|
||
else:
|
||
raw = raw.replace(",", "")
|
||
try:
|
||
v = float(raw)
|
||
return v if v > 0 else None
|
||
except ValueError:
|
||
return None
|
||
|
||
x, y = _grab_usd(a or ""), _grab_usd(b or "")
|
||
if x is None or y is None:
|
||
return False
|
||
lo, hi = (x, y) if x < y else (y, x)
|
||
return (hi / lo) >= 2.0
|
||
|
||
|
||
def _is_tender_multi_origin_context(text: str) -> bool:
|
||
"""
|
||
Тендер / одна заявка с несколькими адресами забора и одной выгрузкой.
|
||
Такие письма нельзя дробить на отдельные shipments по каждой точке.
|
||
"""
|
||
if not isinstance(text, str) or len(text.strip()) < 120:
|
||
return False
|
||
if _is_parallel_multi_cargo_order_table(text):
|
||
return False
|
||
t = text.lower()
|
||
score = 0
|
||
if re.search(r"тендер", t):
|
||
score += 2
|
||
if re.search(
|
||
r"приём\s+предложений|прием\s+предложений|приём\s+оферт|прием\s+оферт",
|
||
t,
|
||
):
|
||
score += 1
|
||
if re.search(r"адрес\s+загрузки\s*\d+", t):
|
||
score += 2
|
||
n_pickups = len(re.findall(r"адрес\s+загрузки\s*\d+", t))
|
||
if n_pickups >= 3:
|
||
score += 3
|
||
elif n_pickups >= 2:
|
||
score += 1
|
||
if re.search(r"\bплеч\w*\b", t) and re.search(r"(?:жд|море|порт|станц)", t):
|
||
score += 1
|
||
if re.search(r"\bozon\b", t) and (n_pickups >= 2 or "стм" in t):
|
||
score += 1
|
||
return score >= 4
|
||
|
||
|
||
# Английские плейсхолдеры в confirmation_template: (Label) -> поле данных
|
||
# Pairs (label, data field). Written as a mapping for readability and
# converted to the list of tuples the rest of the code works with.
BRACKET_LABEL_ALIASES_EN: List[tuple[str, str]] = list(
    {
        "Pickup address": "pickup_address",
        "Delivery address": "delivery_address",
        "Port of loading": "loading_port",
        "Port of discharge": "discharge_port",
        "Cargo weight": "total_weight_kg",
        "Number of packages": "package_count",
        "Volume": "total_volume_cbm",
        "Dimensions": "dimensions_str",
        "Cargo description": "cargo_description",
        "HS code": "hs_code",
        "Dangerous properties": "dangerous_goods_str",
        "MSDS required": "msds_required",
        "Freight cost": "estimated_cost",
        "Transit time": "estimated_transit_time",
        "Shipment type": "shipment_type",
        "Container type": "container_type",
        "Vehicle type": "vehicle_type",
        "Vehicle dimensions": "vehicle_dimensions_str",
        "Temperature regime": "temperature_range",
        "Temperature range": "temperature_range",
    }.items()
)
|
||
|
||
#=============================================================================
|
||
# ОТОБРАЖЕНИЕ КОНТЕЙНЕРОВ (Nxтип без голых 40HC/20DC)
|
||
#=============================================================================
|
||
_CONTAINER_BARE_ONLY = re.compile(
|
||
r"^\s*(20|40|45)\s*['′'`´]?\s*(?:ft|feet|ф)?\s*"
|
||
r"(dc|hc|hq|gp|dv|dry|rf|rh|reefer|ref|ot|fr|tk|tank)\s*$",
|
||
re.IGNORECASE,
|
||
)
|
||
_CONTAINER_NX = re.compile(
|
||
r"(?<![\d.])(?P<n>\d{1,3})\s*[x×х**]\s*"
|
||
r"(?P<t>(?:20|40|45)\s*['′'`´]?\s*(?:ft|feet|ф)?\s*"
|
||
r"(?:dc|hc|hq|gp|dv|dry|rf|rh|reefer|ref|ot|fr|tk|tank)\b)",
|
||
re.IGNORECASE,
|
||
)
|
||
|
||
|
||
def _normalize_container_token(n_str: str, type_part: str) -> str:
|
||
n = int(n_str)
|
||
t = re.sub(r"\s+", "", type_part.strip())
|
||
return f"{n}x{t}"
|
||
|
||
|
||
def normalize_container_type_display(raw: Optional[str], *, empty: str = "") -> str:
    """Render a container type value in the counted "NxTYPE" form only.

    Every "N x <size><type>" occurrence in the value is normalised and the
    results are joined with ", ". When there is none, the value is split on
    commas/semicolons: bare designations without a count ("40HC") are
    dropped, parts that start with "N x" get their separator normalised to
    "x", and every other part is discarded.

    Args:
        raw: Value to display; ``None`` allowed.
        empty: Text returned when nothing displayable remains.
    """
    text = "" if raw is None else str(raw).strip()
    if not text:
        return empty

    counted = [
        _normalize_container_token(hit.group("n"), hit.group("t"))
        for hit in _CONTAINER_NX.finditer(text)
    ]
    if counted:
        return ", ".join(counted)

    fallback: List[str] = []
    for chunk in re.split(r"[,;]+", text):
        chunk = chunk.strip()
        if not chunk or _CONTAINER_BARE_ONLY.match(chunk):
            continue
        hits = list(_CONTAINER_NX.finditer(chunk))
        if hits:
            fallback.extend(
                _normalize_container_token(hit.group("n"), hit.group("t"))
                for hit in hits
            )
        elif re.match(r"^\d+\s*[x×х**]", chunk, re.IGNORECASE):
            # Only the first multiplication sign is rewritten.
            fallback.append(
                re.sub(r"\s*([x×х**])\s*", "x", chunk, count=1, flags=re.IGNORECASE)
            )

    return ", ".join(fallback) if fallback else empty
|
||
|
||
|
||
#=============================================================================
|
||
#КЛАСС ДЛЯ АВТОЗАПОЛНЕНИЯ ШАБЛОНОВ
|
||
#=============================================================================
|
||
class AutoFillDict(dict):
    """Dictionary that yields a value for any key instead of raising.

    A missing key is looked up in the original data and then, through the
    label mapping, by its human-readable label (e.g. "Клиент" ->
    "client_name"). When nothing is found the result is an empty string,
    which suits letter templates that may mention unknown fields.
    """

    def __init__(self, data: dict, field_labels: dict = None):
        super().__init__(data)
        # Reference to the caller's dict: keys added to it later are still
        # found by __missing__.
        self._data = data
        self._field_labels = field_labels or FIELD_LABELS
        self._label_to_field = {
            label: field for field, label in self._field_labels.items()
        }

    def __missing__(self, key):
        """Resolve a key absent from the dict itself; never raises KeyError."""
        # Try the key as given first, then the field its label maps to.
        candidates = [key]
        if key in self._label_to_field:
            candidates.append(self._label_to_field[key])

        for name in candidates:
            if name in self._data:
                value = self._data[name]
                return "" if value is None else str(value)

        # Unknown key: empty string keeps template rendering going.
        return ""
|
||
|
||
#=============================================================================
|
||
#ФУНКЦИЯ АВТОЗАПОЛНЕНИЯ ШАБЛОНА
|
||
#=============================================================================
|
||
def auto_fill_template(template: str, data: Dict, field_labels: Dict = None) -> str:
    """Substitute field values into a letter template.

    Supported placeholder forms:

    - ``{technical_key}``: technical key (or a label known to
      ``AutoFillDict``); unknown placeholders render as an empty string;
    - ``(Human-readable label)``: a label from ``field_labels`` in brackets;
    - ``(English label)``: labels from ``BRACKET_LABEL_ALIASES_EN``.

    Plain text without brackets is never replaced.

    Args:
        template: Template text.
        data: Field values keyed by technical key.
        field_labels: Mapping technical key -> label; defaults to
            ``FIELD_LABELS``.

    Returns:
        The filled-in text. If the ``{...}`` syntax of the template is
        malformed, the brace step is skipped and only bracket labels are
        processed.
    """
    field_labels = field_labels or FIELD_LABELS
    label_to_field = {v: k for k, v in field_labels.items()}
    result = template

    auto_data = AutoFillDict(data, field_labels)

    # 1. Explicit {placeholder} fields.
    # format_map() uses the mapping directly, so AutoFillDict.__missing__
    # resolves unknown keys; format(**mapping) would copy it into a plain
    # dict and lose that behaviour.
    try:
        result = result.format_map(auto_data)
    except (KeyError, IndexError, ValueError, AttributeError, TypeError):
        # Stray braces, positional fields or attribute/index access on a
        # value: keep the text as it is rather than fail the whole letter.
        result = template

    # 2. (Label) in brackets, for both field_labels and the English aliases.
    merged_pairs = list(label_to_field.items()) + BRACKET_LABEL_ALIASES_EN
    for label, field_key in merged_pairs:
        if field_key not in data:
            continue
        value = data[field_key]
        value_text = str(value).strip() if value is not None else ""
        value_str = value_text or "не указан"

        pattern = r'\(' + re.escape(label) + r'\)'
        # A callable replacement inserts the value literally; a plain
        # string would have its backslashes and group references
        # interpreted by re.sub.
        result = re.sub(
            pattern,
            lambda _match, _text=value_str: _text,
            result,
            flags=re.IGNORECASE,
        )

    return result
|
||
|
||
#=============================================================================#
|
||
#ЗАГРУЗКА ТИПОВ ПЕРЕВОЗОК
|
||
#=============================================================================
|
||
def load_shipping_types() -> List[Dict]:
    """Load shipping types from the JSON file, normalizing keys and values.

    Keys are stripped of surrounding whitespace and quotes, string values
    are stripped, and ``keywords`` lists are cleaned. Entries without an
    ``id`` get a sequential one. Each entry also receives ``criteria_ai``:
    the processed text from ``PROCESSED_SHIPPING_TYPES_FILE`` when it
    exists for that type name and contains a tab, otherwise the entry's
    own ``criteria``.

    Returns:
        The list of normalized type dicts; an empty list when the file is
        missing, is not a JSON list, or cannot be read (the problem is
        logged, not raised).
    """

    def _unquote(token: str) -> str:
        return token.strip().strip('"').strip("'")

    def _clean_keywords(items: list) -> List[str]:
        cleaned: List[str] = []
        for item in items:
            if not isinstance(item, str):
                continue
            keyword = _unquote(item).strip()
            if not keyword:
                continue
            if ' ' in keyword and len(keyword) > 2000:
                # A very long keyword containing spaces is split into
                # separate words.
                cleaned.extend(
                    _unquote(piece) for piece in keyword.split() if piece.strip()
                )
            else:
                cleaned.append(keyword)
        return cleaned

    def _normalize_entry(entry: dict) -> dict:
        normalized = {}
        for key, value in entry.items():
            clean_key = _unquote(key)
            if clean_key == "keywords" and isinstance(value, list):
                normalized[clean_key] = _clean_keywords(value)
            elif isinstance(value, str):
                normalized[clean_key] = value.strip()
            else:
                normalized[clean_key] = value
        return normalized

    types_path = resolve_shipping_types_path()
    if not types_path:
        logger.warning(
            "Файл типов перевозок не найден (ожидались shipping_types.json или shipping_types(1).json)"
        )
        return []

    try:
        with open(types_path, "r", encoding="utf-8") as source:
            raw_types = json.load(source)

        if not isinstance(raw_types, list):
            logger.error(f"Expected list in shipping_types.json, got {type(raw_types)}")
            return []

        normalized_types = []
        for candidate in raw_types:
            if not isinstance(candidate, dict):
                continue
            entry = _normalize_entry(candidate)
            entry.setdefault("id", len(normalized_types) + 1)
            normalized_types.append(entry)

        # Merge in the processed (AI) criteria when the file exists.
        processed_map: Dict[str, str] = {}
        if os.path.exists(PROCESSED_SHIPPING_TYPES_FILE):
            try:
                with open(PROCESSED_SHIPPING_TYPES_FILE, "r", encoding="utf-8") as processed_source:
                    raw_processed = json.load(processed_source)
                if isinstance(raw_processed, dict):
                    processed_map = {
                        str(k): str(v) for k, v in raw_processed.items() if v is not None
                    }
            except Exception as e:
                logger.warning(f"Failed to load processed shipping criteria: {e}")

        # criteria_ai: the processed result is used only when it contains a
        # tab; an older short list without tabs is ignored in favour of the
        # raw criteria so that the report does not lose items.
        for entry in normalized_types:
            type_name = entry.get("name")
            processed = processed_map.get(type_name) if isinstance(type_name, str) else None
            use_processed = (
                isinstance(processed, str)
                and bool(processed.strip())
                and "\t" in processed
            )
            entry["criteria_ai"] = processed if use_processed else entry.get("criteria", "")

        logger.info(f"Loaded {len(normalized_types)} shipping types")
        return normalized_types

    except json.JSONDecodeError as e:
        logger.error(f"JSON decode error in shipping_types.json: {e}")
        return []
    except Exception as e:
        logger.error(f"Error loading shipping types: {e}")
        return []
|
||
|
||
#=============================================================================
|
||
#НОРМАЛИЗАЦИЯ КЛЮЧЕЙ СЛОВАРЯ
|
||
#=============================================================================
|
||
def _normalize_dict_keys(data: Any) -> Any:
|
||
"""Рекурсивно нормализует ключи словаря — убирает пробелы и кавычки"""
|
||
if isinstance(data, dict):
|
||
return {k.strip().strip('"').strip("'"): _normalize_dict_keys(v) for k, v in data.items()}
|
||
elif isinstance(data, list):
|
||
return [_normalize_dict_keys(item) for item in data]
|
||
else:
|
||
return data
|
||
|
||
#=============================================================================
|
||
#RAG-ДВИЖОК
|
||
#=============================================================================
|
||
class RAGEngineGemini:
|
||
"""RAG-движок для анализа писем о грузоперевозках и генерации ответов."""
|
||
|
||
def __init__(self, api_base_url: str = "http://localhost:8090/v1"):
    """Create the engine: OpenAI-compatible client, session store, helpers.

    Args:
        api_base_url: Base URL passed to the ``OpenAI`` client.

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is not set in the environment.
    """
    openai_api_key = os.getenv("OPENAI_API_KEY")
    if not openai_api_key:
        raise ValueError("OPENAI_API_KEY not found in environment variables")

    self.openai_client = OpenAI(api_key=openai_api_key, base_url=api_base_url)
    # session_id -> list of indexed e-mail records (filled by process_outlook_emails)
    self.sessions: Dict[str, List[Dict]] = {}
    self.doc_processor = DocumentProcessor()
    self.shipping_types = load_shipping_types()
    logger.info(
        "RAGEngineGemini initialized with %d shipping types", len(self.shipping_types)
    )
|
||
|
||
def reload_shipping_types(self) -> None:
    """Reload shipping types from disk (used after admin updates)."""
    self.shipping_types = load_shipping_types()
    logger.info("Shipping types reloaded: %d types", len(self.shipping_types))
|
||
|
||
def _fallback_process_shipping_type_criteria(self, criteria_text: str) -> str:
|
||
"""
|
||
Детерминированная эвристика, если ИИ не доступен.
|
||
Возвращает короткий список критериев в формате: "A; B; C".
|
||
"""
|
||
if not isinstance(criteria_text, str) or not criteria_text.strip():
|
||
return ""
|
||
text = criteria_text.lower()
|
||
|
||
label_rules = [
|
||
("название клиента" in text, "Клиент"),
|
||
(("код тн вэд" in text) or ("код тн" in text), "Код ТН ВЭД"),
|
||
(("инкотерм" in text), "Условия поставки Incoterms"),
|
||
(("адрес забора" in text) or ("порт отправления" in text) or ("порт погрузки" in text), "Адрес забора груза"),
|
||
(("адрес доставки" in text) or ("порт назначения" in text) or ("порт выгрузки" in text), "Адрес доставки"),
|
||
(("стоимость груза" in text), "Стоимость груза"),
|
||
(("количество грузовых мест" in text) or ("количество мест" in text), "Количество грузовых мест"),
|
||
(("общий вес" in text), "Вес груза"),
|
||
(
|
||
("габарит" in text)
|
||
and (
|
||
"транспортн" in text
|
||
or "машин" in text
|
||
or "средств" in text
|
||
or "кузов" in text
|
||
or "прицеп" in text
|
||
or "полуприцеп" in text
|
||
),
|
||
"Габариты машины",
|
||
),
|
||
(("габарит" in text), "Габариты"),
|
||
(("общий объ" in text) or ("общий объем" in text) or ("объём" in criteria_text.lower()), "Объём"),
|
||
(("характер груза" in text) or ("наименование груза" in text), "Характер груза"),
|
||
(("опасные вещества" in text) or ("опасные свойства" in text) or ("batter" in text), "Опасные свойства"),
|
||
(("msds" in text) or ("паспорт безопасности" in text), "Требуется MSDS"),
|
||
(("dgm" in text) or ("dangerous goods management" in text), "Требуется DGM"),
|
||
(("авторизац" in text), "Разрешение бренда"),
|
||
(("необходимо указать название бренда" in text) or ("название бренда" in text), "Бренд"),
|
||
(("экспортн" in text) and ("лиценз" in text) or ("экспортная лиценз" in text) or ("экспортной лиценз" in text), "Лицензия экспортёра"),
|
||
(("температурный режим" in text) or ("температур" in text), "Температурный режим"),
|
||
]
|
||
|
||
labels: List[str] = [label for ok, label in label_rules if ok]
|
||
# убираем дубликаты, сохраняя порядок
|
||
deduped = list(dict.fromkeys(labels))
|
||
if not deduped:
|
||
return ""
|
||
return "; ".join(deduped)
|
||
def _enrich_special_requirements(self, shipment: Dict, sources: List[Dict]) -> None:
|
||
if not isinstance(shipment, dict):
|
||
return
|
||
raw_text = self._collect_shipment_source_text(shipment, sources)
|
||
if not raw_text.strip():
|
||
return
|
||
|
||
# Ключевые паттерны: обязательно содержат слова штрафа / срока / транзита
|
||
patterns = [
|
||
# "транзитный срок до 25 дней под зеркальные штрафы 250 USD"
|
||
r"(?:транзитн\w*\s+)?срок\w*\s+(?:до\s+)?\d+\s*(?:дн\w*|day\w*).{0,60}?(?:зеркальн\w*\s+)?штраф\w*\s*\d+\s*(?:USD|доллар\w*|руб\w*|EUR)?",
|
||
# "зеркальные штрафы 250 USD"
|
||
r"зеркальн\w*\s+штраф\w*\s*\d+\s*(?:USD|доллар\w*|руб\w*|EUR)?",
|
||
# "штраф за просрочку 100 USD в день"
|
||
r"(?:штраф|penalty|неустойк\w*|пени)\s*(?:за\s+)?(?:просрочк\w*|опоздани\w*|срыв\s+срок\w*).{0,100}?\d+\s*(?:USD|доллар\w*|руб\w*|EUR|дн\w*)",
|
||
# "срок доставки до 20 дней, иначе штраф 500"
|
||
r"срок\s+доставк\w*\s*(?:до\s+)?\d+\s*(?:дн\w*|day\w*).*?(?:штраф|неустойк|пеня|penalty)",
|
||
# "срок доставки 25 дней" / "delivery time 25 days"
|
||
r"(?:срок|транзитн\w*\s+срок)\s+доставк\w*\s*(?:до\s+)?\d+\s*(?:дн\w*|day\w*)",
|
||
r"(?:delivery|transit)\s*time\s*(?:up\s*to\s*)?\d+\s*day\w*",
|
||
# "liquidated damages 1000 USD"
|
||
r"(?:liquidated\s+damages|ld\s*clause).{0,100}?\d+\s*(?:USD|доллар|руб|EUR)",
|
||
# "жёсткий срок до 15.05.2025"
|
||
r"доставк\w*\s+(?:строго|жёстко|обязательно)\s+(?:до|к|не\s+позднее)\s+\d{1,2}\.\d{1,2}(?:\.\d{2,4})?",
|
||
]
|
||
|
||
found_phrases: List[str] = []
|
||
for pat in patterns:
|
||
for m in re.finditer(pat, raw_text, re.IGNORECASE | re.DOTALL):
|
||
phrase = re.sub(r'\s+', ' ', m.group(0).strip())
|
||
if phrase not in found_phrases:
|
||
found_phrases.append(phrase)
|
||
|
||
if not found_phrases:
|
||
return
|
||
|
||
# Объединяем с уже существующим значением, избегая повторов
|
||
existing = (shipment.get("special_transport_requirements") or "").strip()
|
||
if existing:
|
||
new_parts = [p for p in found_phrases if p.lower() not in existing.lower()]
|
||
if new_parts:
|
||
shipment["special_transport_requirements"] = existing + " | " + " | ".join(new_parts)
|
||
else:
|
||
shipment["special_transport_requirements"] = " | ".join(found_phrases)
|
||
|
||
# Расширяем доп.сервисы: сроки доставки и штрафные условия должны быть видны отдельным пунктом.
|
||
extras = shipment.get("additional_services")
|
||
if not isinstance(extras, list):
|
||
extras = []
|
||
seen = {str(x).strip().lower() for x in extras if str(x).strip()}
|
||
for phrase in found_phrases:
|
||
normalized = phrase.strip()
|
||
if not normalized:
|
||
continue
|
||
key = normalized.lower()
|
||
if key not in seen:
|
||
extras.append(normalized)
|
||
seen.add(key)
|
||
shipment["additional_services"] = extras
|
||
|
||
def process_shipping_type_criteria(self, criteria_text: str) -> str:
    """Clean up shipping-type criteria with the AI, keeping the numbered format.

    The criteria are sent to the chat model, which is asked to tidy the
    wording without removing or merging items and to answer as
    ``{"processed_criteria": "..."}``. The answer is re-normalised to
    ``<number>\\t<criterion>`` lines.

    The locally normalised original text is returned instead when the call
    fails, the answer is unusable, or the answer contains fewer items than the
    input (the report must show every criterion).

    Args:
        criteria_text: Raw criteria text, usually a numbered list.

    Returns:
        Normalised criteria, one ``<number>\\t<criterion>`` per line, or ``""``
        for empty / non-string input.
    """
    if not isinstance(criteria_text, str) or not criteria_text.strip():
        return ""

    def _normalize_criteria_text(text: Any) -> str:
        """Rewrite every non-empty line as ``<number>\\t<body>``."""
        if not isinstance(text, str):
            return ""
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        normalized: List[str] = []
        for idx, ln in enumerate(lines, start=1):
            m = re.match(r"^\s*(\d+)[\.\)\-]?\s*(.*)$", ln)
            if m:
                num = m.group(1).strip()
                body = m.group(2).strip()
                # A bare number without text carries no criterion: drop it.
                if body:
                    normalized.append(f"{num}\t{body}")
            else:
                # Unnumbered line: number it by its position.
                normalized.append(f"{idx}\t{ln}")
        return "\n".join(normalized)

    fallback = _normalize_criteria_text(criteria_text)

    system_prompt = (
        "Ты инженер по логистике и формированию отчётов. "
        "Тебе дан текст критерием из поля shipping_type (обычно нумерованный список вида: \"1\\t...\\n2\\t...\"). "
        "Твоя задача: сделать формулировки критериев более аккуратными и единообразными, но без потери смысла. "
        "Нельзя удалять пункты или объединять их. "
        "Верни строго нумерованный список в формате: <номер>\\t<критерий>, каждый пункт с новой строки. "
        "Возвращай ответ строго в JSON-формате: {\"processed_criteria\": \"...\"}."
    )
    user_prompt = f"Исходные критерии shipping_type:\n{criteria_text}"

    try:
        response = self.openai_client.chat.completions.create(
            model="card_generation",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0,
            # NOTE(review): 250 tokens may truncate long criteria lists — confirm.
            max_tokens=250,
        )
        answer_text = response.choices[0].message.content or ""
        # Expected shape: {"processed_criteria": "..."}
        parsed = self._parse_json_response(answer_text)
        processed = parsed.get("processed_criteria") if isinstance(parsed, dict) else None
        if isinstance(processed, str):
            processed = _normalize_criteria_text(processed.strip())
            if processed:
                if len(processed.splitlines()) >= len(fallback.splitlines()):
                    return processed
                logger.warning(
                    "AI returned fewer criteria than provided, fallback to original criteria"
                )
    except Exception as e:
        # Best-effort: any AI/parsing failure falls back to the original text.
        logger.warning(f"AI processing criteria failed, fallback to original criteria: {e}")

    return fallback
|
||
|
||
async def process_outlook_emails(self, emails: List[Dict], session_id: Optional[str] = None) -> str:
    """Index Outlook e-mails into a session and return the session id.

    A new session is created when ``session_id`` is ``None`` or unknown; note
    that an unknown id is replaced by a freshly generated one rather than
    reused. Each e-mail is stored as a record with its extracted text,
    metadata, attachments and detected shipping type.

    Args:
        emails: E-mail dicts (keys read here: ``subject``, ``sender``,
            ``senderName``, ``receivedTime``, ``id``, ``to``, ``cc``).
        session_id: Existing session to append to, if any.

    Returns:
        The id of the session the e-mails were added to.

    Raises:
        Exception: Any error is logged with its traceback and re-raised.
    """
    try:
        if session_id is None or session_id not in self.sessions:
            # md5 is used only to derive a short identifier, not for security.
            session_id = hashlib.md5(
                f"{time.time()}{str(emails)}".encode(), usedforsecurity=False
            ).hexdigest()[:16]
            self.sessions[session_id] = []
            logger.info(f"Created new session {session_id}")

        for email in emails:
            email_text, attachments = self._extract_email_content(email)
            shipping_type = self._detect_shipping_type_from_email(email_text)
            if shipping_type:
                logger.info(f"Detected shipping type: {shipping_type}")

            self.sessions[session_id].append({
                "content": email_text,
                "metadata": {
                    "subject": email.get("subject", ""),
                    "sender": email.get("sender", ""),
                    "sender_name": email.get("senderName", ""),
                    "received_time": str(email.get("receivedTime", "")),
                    "email_id": email.get("id"),
                    "to": email.get("to", ""),
                    "cc": email.get("cc", ""),
                },
                "attachments": attachments,
                "shipping_type": shipping_type,
            })
            logger.info(f"Indexed email {email.get('id')} with session_id {session_id}")

        return session_id

    except Exception as e:
        logger.error(f"Error processing Outlook emails: {e}", exc_info=True)
        raise
|
||
|
||
def _text_requests_all_shipping_types(self, text: str) -> bool:
|
||
"""
|
||
Клиент просит варианты по всем способам доставки (а не один конкретный тип).
|
||
"""
|
||
if not text or not isinstance(text, str):
|
||
return False
|
||
t = text.lower()
|
||
patterns = [
|
||
r"все\s+возможн\w*\s+тип\w*\s+(?:доставк|перевозк|отправк)",
|
||
r"все\s+тип\w*\s+(?:доставк|перевозк|отправк)",
|
||
r"все\s+вариант\w*\s+(?:доставк|перевозк|доставки|перевозки)",
|
||
r"все\s+способ\w*\s+доставк",
|
||
r"по\s+всем\s+тип\w*\s+(?:доставк|перевозк)",
|
||
r"по\s+всем\s+способ\w*\s+доставк",
|
||
r"люб\w*\s+(?:из\s+)?тип\w*\s+доставк",
|
||
r"люб\w*\s+способ\w*\s+доставк",
|
||
r"люб\w*\s+вариант\w*\s+доставк",
|
||
r"все\s+канал\w*\s+доставк",
|
||
r"все\s+модальност\w*",
|
||
r"люб\w*\s+вид\w*\s+доставк",
|
||
r"all\s+(?:possible\s+)?types?\s+of\s+(?:delivery|shipping|transport|freight)",
|
||
r"all\s+(?:shipping|delivery)\s+options?",
|
||
r"any\s+(?:type\s+of\s+)?(?:delivery|shipping|freight)",
|
||
r"every\s+(?:shipping|delivery)\s+(?:type|option|mode)",
|
||
]
|
||
return any(re.search(p, t, re.IGNORECASE) for p in patterns)
|
||
|
||
def _detect_shipping_type_from_email(self, email_text: str) -> Optional[str]:
|
||
"""Тип перевозки выбирает основной LLM; при индексации писем не подставляем тип по ключевым словам."""
|
||
return None
|
||
|
||
def _extract_emails_from_text(self, raw: Any) -> List[str]:
|
||
if not raw:
|
||
return []
|
||
text = str(raw).lower()
|
||
return list(dict.fromkeys(re.findall(r"[a-z0-9._%+\-]+@[a-z0-9.\-]+\.[a-z]{2,}", text)))
|
||
|
||
def _recipients_for_shipment(self, shipment: Dict, sources: List[Dict]) -> set[str]:
|
||
if not isinstance(shipment, dict) or not isinstance(sources, list):
|
||
return set()
|
||
email_ids = set(shipment.get("ID_emails") or [])
|
||
if not email_ids:
|
||
return set()
|
||
recipients: set[str] = set()
|
||
for src in sources:
|
||
if not isinstance(src, dict):
|
||
continue
|
||
if src.get("id") not in email_ids:
|
||
continue
|
||
recipients.update(self._extract_emails_from_text(src.get("to")))
|
||
recipients.update(self._extract_emails_from_text(src.get("cc")))
|
||
return recipients
|
||
|
||
def _shipping_types_matching_recipients(self, recipients: set[str]) -> List[Dict]:
|
||
if not recipients or not self.shipping_types:
|
||
return []
|
||
matches: List[Dict] = []
|
||
for st in self.shipping_types:
|
||
if not isinstance(st, dict):
|
||
continue
|
||
raw = st.get("employee_email") or ""
|
||
bound_emails = set(self._extract_emails_from_text(raw))
|
||
if bound_emails and (bound_emails & recipients):
|
||
matches.append(st)
|
||
return matches
|
||
|
||
def _pick_recipient_shipping_type(
|
||
self, recipients: set[str], combined_text: str
|
||
) -> Optional[Dict]:
|
||
"""
|
||
Если на ящик заведено несколько типов — без скоринга по тексту: первый совпавший
|
||
в порядке записей в shipping_types (combined_text зарезервирован под будущие правила).
|
||
"""
|
||
_ = combined_text
|
||
matches = self._shipping_types_matching_recipients(recipients)
|
||
if not matches:
|
||
return None
|
||
return matches[0]
|
||
|
||
def _shipping_type_by_recipient(self, recipients: set[str]) -> Optional[Dict]:
|
||
m = self._shipping_types_matching_recipients(recipients)
|
||
return m[0] if m else None
|
||
|
||
def normalize_shipment(self, shipment: dict, all_shipments: list | None = None) -> dict:
|
||
"""
|
||
Унифицирует и исправляет данные shipment:
|
||
- Китай → обязательная авторизация бренда
|
||
- None вместо ложных false
|
||
- корректная логика MSDS / DGM (включая "не предоставляет")
|
||
- перенос общих полей (штабелирование и т.д.)
|
||
- таможня → только место
|
||
"""
|
||
|
||
if not isinstance(shipment, dict):
|
||
return shipment
|
||
|
||
# =========================================================
|
||
# 1. Утилиты
|
||
# =========================================================
|
||
def is_empty(val):
|
||
return val is None or (isinstance(val, str) and not val.strip())
|
||
|
||
# =========================================================
|
||
# 2. Китай / бренд → авторизационное письмо
|
||
# =========================================================
|
||
pickup = shipment.get("pickup_address") or ""
|
||
loading = shipment.get("loading_port") or ""
|
||
delivery = shipment.get("delivery_address") or ""
|
||
cargo_desc = shipment.get("cargo_description") or ""
|
||
brand = shipment.get("brand_name")
|
||
|
||
loc_for_china = " ".join(str(x) for x in (pickup, loading, delivery, cargo_desc) if x)
|
||
if (
|
||
brand
|
||
and isinstance(brand, str)
|
||
and brand.strip()
|
||
and self._text_mentions_china_context(loc_for_china)
|
||
):
|
||
shipment["brand_authorization_letter"] = True
|
||
|
||
# =========================================================
|
||
# 3. Батарейки (важно: None != False)
|
||
# =========================================================
|
||
dg = shipment.get("dangerous_goods")
|
||
|
||
if isinstance(dg, dict):
|
||
# если вообще ничего не указано (нет ни True, ни False) → всё в None
|
||
# Важно: явные False (non-DG) сохраняем, не затираем.
|
||
has_true = any(v is True for v in dg.values())
|
||
has_false = any(v is False for v in dg.values())
|
||
if not has_true and not has_false:
|
||
shipment["dangerous_goods"] = {
|
||
"batteries": None,
|
||
"gases": None,
|
||
"liquids": None,
|
||
"dry_ice": None
|
||
}
|
||
|
||
# batteries_packed_separately
|
||
if shipment.get("batteries_packed_separately") is False:
|
||
# если батарейки вообще не указаны → должно быть None
|
||
if not isinstance(dg, dict) or dg.get("batteries") is not True:
|
||
shipment["batteries_packed_separately"] = None
|
||
|
||
# =========================================================
|
||
# 4. MSDS / DGM логика
|
||
# =========================================================
|
||
docs = shipment.get("documents_found", {}) or {}
|
||
|
||
# MSDS
|
||
if shipment.get("msds_required"):
|
||
if not docs.get("msds"):
|
||
shipment["msds_status"] = "not_provided"
|
||
else:
|
||
shipment["msds_status"] = "provided"
|
||
else:
|
||
shipment["msds_status"] = None
|
||
|
||
# DGM
|
||
if shipment.get("dgm_report_required"):
|
||
if not docs.get("dgm"):
|
||
shipment["dgm_status"] = "not_provided"
|
||
else:
|
||
shipment["dgm_status"] = "provided"
|
||
else:
|
||
shipment["dgm_status"] = None
|
||
|
||
# =========================================================
|
||
# 5. Таможня → только место (убираем "нужно/не нужно")
|
||
# =========================================================
|
||
place = shipment.get("customs_clearance_place_export_rf")
|
||
|
||
if is_empty(place):
|
||
shipment["customs_clearance_place_export_rf"] = None
|
||
else:
|
||
shipment["customs_clearance_required"] = None # отключаем булевую логику
|
||
|
||
# =========================================================
|
||
# 6. Контейнеры (морская логика)
|
||
# =========================================================
|
||
CONTAINER_TYPES_MAP = {
|
||
# 20 футов стандарт (20DC / 20GP)
|
||
"20DC": [
|
||
"20dc", "20 dc", "20gp", "20 gp",
|
||
"20'", "20 ft", "20фт", "20 фут", "20 футов",
|
||
"контейнер 20 фут", "контейнер 20 футов",
|
||
"20 футовый контейнер", "20-футовый контейнер",
|
||
"20 фут / контейнер 20 футов",
|
||
"20f", "20 фут контейнер"
|
||
],
|
||
|
||
# 40 футов стандарт (40DC / 40GP)
|
||
"40DC": [
|
||
"40dc", "40 dc", "40gp", "40 gp",
|
||
"40'", "40 ft", "40фт", "40 фут", "40 футов",
|
||
"контейнер 40 фут", "контейнер 40 футов",
|
||
"40 футовый контейнер", "40-футовый контейнер",
|
||
"40f", "40 фут контейнер"
|
||
],
|
||
|
||
# 40 футов High Cube
|
||
"40HC": [
|
||
"40hc", "40 hc", "40hq", "40 hq",
|
||
"high cube", "hc",
|
||
"40 фут hc", "40 футов hc",
|
||
"40 футов высокий", "высокий контейнер 40",
|
||
"40 футов high cube",
|
||
"40hc контейнер"
|
||
],
|
||
|
||
# 45 футов High Cube
|
||
"45HC": [
|
||
"45hc", "45 hc",
|
||
"45'", "45 ft", "45 фут", "45 футов",
|
||
"контейнер 45 фут", "45 футовый контейнер",
|
||
"45 футов high cube"
|
||
],
|
||
|
||
# LCL / сборный груз
|
||
"LCL": [
|
||
"lcl", "сборный", "сборный груз",
|
||
"частичная загрузка", "менее контейнера",
|
||
"less than container load"
|
||
],
|
||
|
||
# FCL (полный контейнер)
|
||
"FCL": [
|
||
"fcl", "полный контейнер", "целый контейнер",
|
||
"full container load"
|
||
]
|
||
}
|
||
|
||
|
||
def detect_container_type(text: str) -> str | None:
|
||
text = text.lower()
|
||
|
||
for normalized, variants in CONTAINER_TYPES_MAP.items():
|
||
for variant in variants:
|
||
if variant in text:
|
||
return normalized
|
||
|
||
return None
|
||
|
||
|
||
container = shipment.get("container_type")
|
||
|
||
if not container:
|
||
# не завязываемся на FCL/LCL — просто оставляем None
|
||
shipment["container_type"] = None
|
||
|
||
# =========================================================
|
||
# 7. Штабелирование — не копируем между разными перевозками в одном отчёте
|
||
# =========================================================
|
||
if (
|
||
all_shipments
|
||
and isinstance(all_shipments, list)
|
||
and len(all_shipments) == 1
|
||
):
|
||
base = all_shipments[0]
|
||
|
||
for field in ["stackable_with_others", "stackable_among_themselves"]:
|
||
if shipment.get(field) is None:
|
||
shipment[field] = base.get(field)
|
||
|
||
# =========================================================
|
||
# 8. Габариты → гарантируем список
|
||
# =========================================================
|
||
dims = shipment.get("dimensions")
|
||
|
||
if not isinstance(dims, list):
|
||
shipment["dimensions"] = []
|
||
else:
|
||
# Нормализуем ключи/единицы и убираем неполные варианты.
|
||
# В некоторых ответах ИИ размеры приходят как length/width/height или в *_mm.
|
||
def _to_float(v: Any) -> Optional[float]:
|
||
if isinstance(v, (int, float)):
|
||
return float(v)
|
||
if isinstance(v, str):
|
||
s = v.strip().replace(" ", "")
|
||
s = s.replace(",", ".")
|
||
try:
|
||
return float(s)
|
||
except Exception:
|
||
return None
|
||
return None
|
||
|
||
def _normalize_one_dim(d: Any) -> Optional[Dict[str, float]]:
|
||
if not isinstance(d, dict):
|
||
return None
|
||
|
||
# Если LLM уже заполнил именно *_cm поля, считаем их первичным источником.
|
||
# Это защищает от двойной конвертации (например 107 см -> 10.7 см при unit=mm).
|
||
has_explicit_cm_keys = any(
|
||
d.get(k) is not None for k in ("length_cm", "width_cm", "height_cm")
|
||
)
|
||
l_cm = _to_float(d.get("length_cm"))
|
||
w_cm = _to_float(d.get("width_cm"))
|
||
h_cm = _to_float(d.get("height_cm"))
|
||
|
||
# Если пришли *_mm — переводим в см.
|
||
used_mm = False
|
||
if l_cm is None and d.get("length_mm") is not None:
|
||
l_mm = _to_float(d.get("length_mm"))
|
||
if l_mm is not None:
|
||
l_cm = l_mm / 10.0
|
||
used_mm = True
|
||
if w_cm is None and d.get("width_mm") is not None:
|
||
w_mm = _to_float(d.get("width_mm"))
|
||
if w_mm is not None:
|
||
w_cm = w_mm / 10.0
|
||
used_mm = True
|
||
if h_cm is None and d.get("height_mm") is not None:
|
||
h_mm = _to_float(d.get("height_mm"))
|
||
if h_mm is not None:
|
||
h_cm = h_mm / 10.0
|
||
used_mm = True
|
||
|
||
# Если пришло без суффикса — считаем, что это см, но при явных mm-сайзах конвертим.
|
||
if l_cm is None and d.get("length") is not None:
|
||
l_cm = _to_float(d.get("length"))
|
||
if w_cm is None and d.get("width") is not None:
|
||
w_cm = _to_float(d.get("width"))
|
||
if h_cm is None and d.get("height") is not None:
|
||
h_cm = _to_float(d.get("height"))
|
||
|
||
if l_cm is None or w_cm is None or h_cm is None:
|
||
return None
|
||
|
||
unit_key = _normalize_dimension_unit_field(
|
||
d.get("dimension_unit") or d.get("unit") or d.get("dimensions_unit")
|
||
)
|
||
if not used_mm:
|
||
if has_explicit_cm_keys:
|
||
# Для *_cm не применяем unit=mm повторно.
|
||
# Исключение: явно аномально большие значения — вероятно, это всё же мм.
|
||
mx = max(l_cm, w_cm, h_cm)
|
||
if mx >= 1000:
|
||
l_cm, w_cm, h_cm = infer_cargo_triple_raw_to_cm(
|
||
l_cm, w_cm, h_cm, explicit_unit="mm"
|
||
)
|
||
elif unit_key is None:
|
||
l_cm, w_cm, h_cm = infer_cargo_triple_raw_to_cm(l_cm, w_cm, h_cm)
|
||
elif unit_key == "mm":
|
||
l_cm, w_cm, h_cm = infer_cargo_triple_raw_to_cm(
|
||
l_cm, w_cm, h_cm, explicit_unit="mm"
|
||
)
|
||
elif unit_key == "m":
|
||
l_cm, w_cm, h_cm = infer_cargo_triple_raw_to_cm(
|
||
l_cm, w_cm, h_cm, explicit_unit="m"
|
||
)
|
||
elif unit_key == "cm":
|
||
pass
|
||
|
||
# Финальная валидация (не допускаем нули/отрицательные).
|
||
if l_cm <= 0 or w_cm <= 0 or h_cm <= 0:
|
||
return None
|
||
|
||
return {
|
||
"length_cm": round(l_cm, 4),
|
||
"width_cm": round(w_cm, 4),
|
||
"height_cm": round(h_cm, 4),
|
||
}
|
||
|
||
# Дедупликация по (Д,Ш,В)
|
||
cleaned: List[Dict[str, float]] = []
|
||
seen: set[tuple[float, float, float]] = set()
|
||
for d in dims:
|
||
norm = _normalize_one_dim(d)
|
||
if not norm:
|
||
continue
|
||
key = (
|
||
round(norm["length_cm"], 1),
|
||
round(norm["width_cm"], 1),
|
||
round(norm["height_cm"], 1),
|
||
)
|
||
if key in seen:
|
||
continue
|
||
seen.add(key)
|
||
cleaned.append(norm)
|
||
|
||
shipment["dimensions"] = cleaned
|
||
|
||
# =========================================================
|
||
# 8b. Габариты ТС (машина/кузов) — отдельно от груза
|
||
# =========================================================
|
||
vdims = shipment.get("vehicle_dimensions")
|
||
if not isinstance(vdims, list):
|
||
shipment["vehicle_dimensions"] = []
|
||
else:
|
||
def _to_float_v(v: Any) -> Optional[float]:
|
||
if isinstance(v, (int, float)):
|
||
return float(v)
|
||
if isinstance(v, str):
|
||
s = v.strip().replace(" ", "").replace(",", ".")
|
||
try:
|
||
return float(s)
|
||
except Exception:
|
||
return None
|
||
return None
|
||
|
||
def _normalize_vehicle_dim(d: Any) -> Optional[Dict[str, float]]:
|
||
if not isinstance(d, dict):
|
||
return None
|
||
lm = _to_float_v(d.get("length_m"))
|
||
wm = _to_float_v(d.get("width_m"))
|
||
hm = _to_float_v(d.get("height_m"))
|
||
if lm is not None and wm is not None and hm is not None:
|
||
l_cm, w_cm, h_cm = lm * 100.0, wm * 100.0, hm * 100.0
|
||
else:
|
||
l_cm = _to_float_v(d.get("length_cm"))
|
||
w_cm = _to_float_v(d.get("width_cm"))
|
||
h_cm = _to_float_v(d.get("height_cm"))
|
||
used_mm = False
|
||
if l_cm is None and d.get("length_mm") is not None:
|
||
l_mm = _to_float_v(d.get("length_mm"))
|
||
if l_mm is not None:
|
||
l_cm = l_mm / 10.0
|
||
used_mm = True
|
||
if w_cm is None and d.get("width_mm") is not None:
|
||
w_mm = _to_float_v(d.get("width_mm"))
|
||
if w_mm is not None:
|
||
w_cm = w_mm / 10.0
|
||
used_mm = True
|
||
if h_cm is None and d.get("height_mm") is not None:
|
||
h_mm = _to_float_v(d.get("height_mm"))
|
||
if h_mm is not None:
|
||
h_cm = h_mm / 10.0
|
||
used_mm = True
|
||
if l_cm is None and d.get("length") is not None:
|
||
l_cm = _to_float_v(d.get("length"))
|
||
if w_cm is None and d.get("width") is not None:
|
||
w_cm = _to_float_v(d.get("width"))
|
||
if h_cm is None and d.get("height") is not None:
|
||
h_cm = _to_float_v(d.get("height"))
|
||
if l_cm is None or w_cm is None or h_cm is None:
|
||
return None
|
||
# Не применяем эвристику мм↔см по max>120: у ТС длина часто >120 см.
|
||
if not used_mm and max(l_cm, w_cm, h_cm) <= 30 and min(l_cm, w_cm, h_cm) >= 0.3:
|
||
l_cm *= 100.0
|
||
w_cm *= 100.0
|
||
h_cm *= 100.0
|
||
if l_cm <= 0 or w_cm <= 0 or h_cm <= 0:
|
||
return None
|
||
return {
|
||
"length_cm": round(l_cm, 4),
|
||
"width_cm": round(w_cm, 4),
|
||
"height_cm": round(h_cm, 4),
|
||
}
|
||
|
||
cleaned_v: List[Dict[str, float]] = []
|
||
seen_v: set[tuple[float, float, float]] = set()
|
||
for d in vdims:
|
||
norm = _normalize_vehicle_dim(d)
|
||
if not norm:
|
||
continue
|
||
key = (
|
||
round(norm["length_cm"], 1),
|
||
round(norm["width_cm"], 1),
|
||
round(norm["height_cm"], 1),
|
||
)
|
||
if key in seen_v:
|
||
continue
|
||
seen_v.add(key)
|
||
cleaned_v.append(norm)
|
||
shipment["vehicle_dimensions"] = cleaned_v
|
||
|
||
# =========================================================
|
||
# 9. Таможка не упоминается → None (не True!)
|
||
# =========================================================
|
||
if shipment.get("customs_clearance_required") is True:
|
||
if is_empty(place):
|
||
shipment["customs_clearance_required"] = None
|
||
|
||
# =========================================================
|
||
# 10. Доп. защита от "галлюцинаций"
|
||
# =========================================================
|
||
for key in [
|
||
"msds_required",
|
||
"dgm_report_required",
|
||
"brand_authorization_letter",
|
||
"exporter_has_export_license"
|
||
]:
|
||
if shipment.get(key) not in [True, False]:
|
||
shipment[key] = None
|
||
|
||
return shipment
|
||
|
||
def _normalize_text_key(self, value: Any) -> str:
|
||
if not isinstance(value, str):
|
||
return ""
|
||
# Нормализуем для группировки: регистр/пробелы/знаки препинания.
|
||
return re.sub(r"[\s,.;:()\-_/]+", " ", value.lower()).strip()
|
||
|
||
def _merge_bool_values(self, a: Any, b: Any) -> Any:
|
||
# Не додумываем: если есть конфликт true/false -> None.
|
||
if a is None:
|
||
return b
|
||
if b is None:
|
||
return a
|
||
if bool(a) == bool(b):
|
||
return bool(a)
|
||
return None
|
||
|
||
def _cargo_ready_date_parts(self, val: Any) -> List[str]:
|
||
if val is None:
|
||
return []
|
||
if isinstance(val, list):
|
||
return [str(x).strip() for x in val if x is not None and str(x).strip()]
|
||
s = str(val).strip()
|
||
if not s:
|
||
return []
|
||
return [p.strip() for p in re.split(r"\s*,\s*|\s*;\s*", s) if p.strip()]
|
||
|
||
def _merge_cargo_ready_date(self, a: Any, b: Any) -> str:
|
||
merged = list(dict.fromkeys(self._cargo_ready_date_parts(a) + self._cargo_ready_date_parts(b)))
|
||
return ", ".join(merged)
|
||
|
||
def _merge_shipment_into(self, base: Dict, s: Dict) -> None:
|
||
"""Сливает поля перевозки s в base (изменяет base)."""
|
||
if not isinstance(base, dict) or not isinstance(s, dict):
|
||
return
|
||
|
||
ids = list(dict.fromkeys((base.get("ID_emails") or []) + (s.get("ID_emails") or [])))
|
||
base["ID_emails"] = ids
|
||
|
||
pickups = []
|
||
for p in [base.get("pickup_address"), s.get("pickup_address")]:
|
||
if isinstance(p, str) and p.strip():
|
||
pickups.extend([x.strip() for x in p.split(" | ") if x.strip()])
|
||
unique_pickups_list = list(dict.fromkeys(pickups))
|
||
base["pickup_address"] = " | ".join(unique_pickups_list) if unique_pickups_list else ""
|
||
|
||
for num_key in ["package_count", "total_weight_kg", "total_volume_cbm"]:
|
||
a = base.get(num_key)
|
||
b = s.get(num_key)
|
||
if isinstance(a, (int, float)) and isinstance(b, (int, float)):
|
||
base[num_key] = a + b
|
||
elif a is None and isinstance(b, (int, float)):
|
||
base[num_key] = b
|
||
|
||
base_dims = base.get("dimensions") if isinstance(base.get("dimensions"), list) else []
|
||
cur_dims = s.get("dimensions") if isinstance(s.get("dimensions"), list) else []
|
||
base["dimensions"] = base_dims + cur_dims
|
||
|
||
base_vd = base.get("vehicle_dimensions") if isinstance(base.get("vehicle_dimensions"), list) else []
|
||
cur_vd = s.get("vehicle_dimensions") if isinstance(s.get("vehicle_dimensions"), list) else []
|
||
base["vehicle_dimensions"] = base_vd + cur_vd
|
||
|
||
for list_key in ["additional_services", "shipping_options", "shipping_type_candidates"]:
|
||
base_list = base.get(list_key) if isinstance(base.get(list_key), list) else []
|
||
cur_list = s.get(list_key) if isinstance(s.get(list_key), list) else []
|
||
merged = []
|
||
seen = set()
|
||
for item in base_list + cur_list:
|
||
marker = json.dumps(item, ensure_ascii=False, sort_keys=True, default=str)
|
||
if marker in seen:
|
||
continue
|
||
seen.add(marker)
|
||
merged.append(item)
|
||
base[list_key] = merged
|
||
|
||
def _name_list_union(a_raw: Any, b_raw: Any) -> List[str]:
|
||
def _parts(x: Any) -> List[str]:
|
||
if x is None:
|
||
return []
|
||
if isinstance(x, str):
|
||
return [x.strip()] if x.strip() else []
|
||
if isinstance(x, list):
|
||
return [str(z).strip() for z in x if isinstance(z, str) and str(z).strip()]
|
||
return []
|
||
|
||
return list(dict.fromkeys(_parts(a_raw) + _parts(b_raw)))
|
||
|
||
r_merged = _name_list_union(
|
||
base.get("requested_shipping_type_names"), s.get("requested_shipping_type_names")
|
||
)
|
||
if r_merged:
|
||
base["requested_shipping_type_names"] = r_merged
|
||
|
||
dg_keys = ["batteries", "gases", "liquids", "dry_ice"]
|
||
dg_a = base.get("dangerous_goods") if isinstance(base.get("dangerous_goods"), dict) else {}
|
||
dg_b = s.get("dangerous_goods") if isinstance(s.get("dangerous_goods"), dict) else {}
|
||
dg_merged = {}
|
||
for k in dg_keys:
|
||
va = dg_a.get(k)
|
||
vb = dg_b.get(k)
|
||
if va is True or vb is True:
|
||
dg_merged[k] = True
|
||
elif va is False or vb is False:
|
||
dg_merged[k] = False
|
||
else:
|
||
dg_merged[k] = None
|
||
base["dangerous_goods"] = dg_merged
|
||
|
||
for bool_key in [
|
||
"stackable_with_others", "stackable_among_themselves",
|
||
"msds_required", "batteries_packed_separately", "dgm_report_required",
|
||
"brand_authorization_letter", "document_replacement_needed",
|
||
"transshipment_with_third_country",
|
||
"exporter_has_export_license", "customs_clearance_required",
|
||
"fumigation_on_wooden_packaging"
|
||
]:
|
||
base[bool_key] = self._merge_bool_values(base.get(bool_key), s.get(bool_key))
|
||
|
||
cr_merged = self._merge_cargo_ready_date(base.get("cargo_ready_date"), s.get("cargo_ready_date"))
|
||
base["cargo_ready_date"] = cr_merged
|
||
|
||
for text_key in [
|
||
"client_name", "incoterms", "cargo_value", "cargo_description", "hs_code",
|
||
"brand_name", "dangerous_goods_note", "arrival_expediting_responsibility",
|
||
"special_transport_requirements",
|
||
"customs_clearance_place_export_rf", "shipping_type", "criteria", "criteria_preview",
|
||
"delivery_address", "loading_port", "discharge_port",
|
||
"container_type", "vehicle_type",
|
||
]:
|
||
a = (base.get(text_key) or "").strip() if isinstance(base.get(text_key), str) else ""
|
||
b = (s.get(text_key) or "").strip() if isinstance(s.get(text_key), str) else ""
|
||
if not a and b:
|
||
base[text_key] = b
|
||
elif a and b and a != b:
|
||
vals = list(dict.fromkeys([x.strip() for x in (a + " | " + b).split(" | ") if x.strip()]))
|
||
base[text_key] = " | ".join(vals)
|
||
|
||
def _collapse_shipments_single_source(self, shipments: List[Dict], sources: List[Dict]) -> List[Dict]:
|
||
"""
|
||
Если анализируется одно письмо, а модель вернула несколько shipments — мы не обязаны
|
||
склеивать всё в одну запись.
|
||
Склеиваем только те shipments, которые выглядят как одна и та же партия груза (нет конфликтов
|
||
по смысловым якорям: бренд/HS/описание груза/даты готовности/документы и т.д.).
|
||
"""
|
||
if not isinstance(shipments, list) or len(shipments) <= 1:
|
||
return shipments
|
||
if not isinstance(sources, list) or len(sources) != 1:
|
||
return shipments
|
||
|
||
# Доп. эвристика: если LLM вернул один shipment, но в письме указано несколько
|
||
# явных кодов позиций (SKU/модель/артикул), попробуем разделить на несколько shipments.
|
||
# Важно: делаем это ТОЛЬКО когда числовые поля (вес/кол-во/габариты) не готовы к корректному распределению.
|
||
if len(shipments) == 1:
|
||
only = shipments[0] if isinstance(shipments[0], dict) else None
|
||
if isinstance(only, dict):
|
||
split_by_codes = self._split_single_shipment_by_position_codes(only, sources[0])
|
||
if isinstance(split_by_codes, list) and len(split_by_codes) > 1:
|
||
logger.info(
|
||
"Split one shipment into %d by cargo position codes (single-email session)",
|
||
len(split_by_codes),
|
||
)
|
||
return split_by_codes
|
||
|
||
def _norm_info_str(v: Any) -> str:
|
||
if isinstance(v, list):
|
||
parts = []
|
||
for x in v:
|
||
s = str(x).strip() if x is not None else ""
|
||
if s:
|
||
parts.append(s)
|
||
v = ", ".join(parts)
|
||
|
||
if not isinstance(v, str):
|
||
return ""
|
||
s = v.strip()
|
||
if not s:
|
||
return ""
|
||
low = s.lower()
|
||
if low in {"не указано", "информация отсутствует", "нет", "unknown", "n/a", "na", "none"}:
|
||
return ""
|
||
return s
|
||
|
||
def _bool_or_none(v: Any) -> Optional[bool]:
|
||
if v is True:
|
||
return True
|
||
if v is False:
|
||
return False
|
||
return None
|
||
|
||
def _any_true_dg(dg: Any) -> bool:
|
||
if not isinstance(dg, dict):
|
||
return False
|
||
for k in ("batteries", "gases", "liquids", "dry_ice"):
|
||
if dg.get(k) is True:
|
||
return True
|
||
return False
|
||
|
||
def _conflict_score(a: Dict, b: Dict) -> int:
|
||
score = 0
|
||
|
||
a_brand = _norm_info_str(a.get("brand_name"))
|
||
b_brand = _norm_info_str(b.get("brand_name"))
|
||
if a_brand and b_brand and a_brand != b_brand:
|
||
score += 2
|
||
|
||
a_hs = _norm_info_str(a.get("hs_code"))
|
||
b_hs = _norm_info_str(b.get("hs_code"))
|
||
if a_hs and b_hs and a_hs != b_hs:
|
||
score += 2
|
||
|
||
a_cd = _norm_info_str(a.get("cargo_description"))
|
||
b_cd = _norm_info_str(b.get("cargo_description"))
|
||
if a_cd and b_cd and a_cd != b_cd:
|
||
score += 2
|
||
|
||
# Готовность к отгрузке может отличаться для разных партий
|
||
a_rd = _norm_info_str(a.get("cargo_ready_date"))
|
||
b_rd = _norm_info_str(b.get("cargo_ready_date"))
|
||
if a_rd and b_rd and a_rd != b_rd:
|
||
score += 1
|
||
|
||
# Ценность может отличаться для разных партий/позиции
|
||
a_val = _norm_info_str(a.get("cargo_value"))
|
||
b_val = _norm_info_str(b.get("cargo_value"))
|
||
if a_val and b_val and a_val != b_val:
|
||
score += 1
|
||
if _strong_invoice_value_mismatch(
|
||
str(a.get("cargo_value") or ""), str(b.get("cargo_value") or "")
|
||
):
|
||
score += 4
|
||
|
||
for key in ("msds_required", "dgm_report_required", "brand_authorization_letter"):
|
||
av = _bool_or_none(a.get(key))
|
||
bv = _bool_or_none(b.get(key))
|
||
if av is not None and bv is not None and av != bv:
|
||
score += 2
|
||
|
||
a_dg = _any_true_dg(a.get("dangerous_goods"))
|
||
b_dg = _any_true_dg(b.get("dangerous_goods"))
|
||
if a_dg != b_dg:
|
||
score += 1
|
||
|
||
return score
|
||
|
||
clusters: List[Dict] = []
|
||
for s in shipments:
|
||
if not isinstance(s, dict):
|
||
continue
|
||
placed = False
|
||
for base in clusters:
|
||
if _conflict_score(base, s) <= 1:
|
||
self._merge_shipment_into(base, s)
|
||
placed = True
|
||
break
|
||
if not placed:
|
||
clusters.append(dict(s))
|
||
|
||
logger.info(
|
||
"Clustered %d shipments (single email in session) -> %d",
|
||
len(shipments),
|
||
len(clusters),
|
||
)
|
||
return clusters
|
||
|
||
def _split_single_shipment_by_position_codes(
|
||
self, shipment: Dict, source: Dict
|
||
) -> List[Dict]:
|
||
"""
|
||
Пытается разделить один shipment на несколько, если в тексте видны несколько
|
||
разных кодов позиций (например, trocyps-26003, trocyps-26004).
|
||
|
||
Делает это только когда нет явных числовых распределений, чтобы не было риска
|
||
удвоить веса/габариты.
|
||
"""
|
||
if not isinstance(shipment, dict) or not isinstance(source, dict):
|
||
return [shipment]
|
||
|
||
head_txt = " ".join(
|
||
str(x or "")
|
||
for x in (
|
||
source.get("subject"),
|
||
source.get("content"),
|
||
)
|
||
)
|
||
if _is_tender_multi_origin_context(head_txt):
|
||
logger.info(
|
||
"Skip split by position codes: tender / multi-origin pickup context detected"
|
||
)
|
||
return [shipment]
|
||
|
||
# Числовые поля: если они заполнены, мы не знаем как корректно распределить по позициям.
|
||
# Тогда безопаснее не разделять детерминированно.
|
||
if shipment.get("total_weight_kg") not in [None, "", "не указан"]:
|
||
return [shipment]
|
||
if shipment.get("package_count") not in [None, "", "не указан"]:
|
||
return [shipment]
|
||
dims = shipment.get("dimensions")
|
||
if isinstance(dims, list) and dims:
|
||
return [shipment]
|
||
# Объём/CBM тоже лучше не делить без правил.
|
||
if shipment.get("total_volume_cbm") not in [None, "", "не указан"]:
|
||
return [shipment]
|
||
|
||
# Берём максимум текста, чтобы коды точно нашлись
|
||
blob = self._collect_shipment_source_text(shipment, [source]) or ""
|
||
|
||
# Основной кейс: trocyps-12345 / trocyps_12345 / trocyps 12345
|
||
codes = re.findall(r"\btrocyps[-_\s]?\d+\b", blob, flags=re.IGNORECASE)
|
||
if not codes:
|
||
# Общая эвристика: ABC-12345 (3+ буквы/цифры через дефис)
|
||
codes = re.findall(r"\b[A-Za-z]{2,}[-_]\d{2,}\b", blob)
|
||
|
||
# Нормализуем в нижний регистр и дедуплицируем
|
||
norm_codes: List[str] = []
|
||
seen = set()
|
||
for c in codes:
|
||
cc = str(c).strip()
|
||
if not cc:
|
||
continue
|
||
key = cc.lower()
|
||
if key in seen:
|
||
continue
|
||
seen.add(key)
|
||
norm_codes.append(cc)
|
||
|
||
if len(norm_codes) < 2:
|
||
return [shipment]
|
||
|
||
# Разрезаем cargo_description (или blob) по появлениям кодов и берём сегменты.
|
||
cargo_src = shipment.get("cargo_description")
|
||
cargo_src_str = cargo_src if isinstance(cargo_src, str) else ""
|
||
split_text = cargo_src_str.strip() or blob
|
||
|
||
# Определяем индексы в split_text
|
||
indices: List[tuple[int, str]] = []
|
||
lower_split = split_text.lower()
|
||
for code in norm_codes:
|
||
idx = lower_split.find(code.lower())
|
||
if idx >= 0:
|
||
indices.append((idx, code))
|
||
|
||
if len(indices) < 2:
|
||
return [shipment]
|
||
|
||
indices.sort(key=lambda x: x[0])
|
||
segments: List[str] = []
|
||
for i, (start, code) in enumerate(indices):
|
||
end = indices[i + 1][0] if i + 1 < len(indices) else len(split_text)
|
||
seg = split_text[start:end].strip()
|
||
segments.append(seg if seg else code)
|
||
|
||
out: List[Dict] = []
|
||
for i, seg in enumerate(segments):
|
||
new_s = dict(shipment)
|
||
# Положим в описание сегмент вокруг конкретного кода позиции
|
||
# Верхний предел только против аномально больших строк (см. RAG_MAX_SEGMENT_CHARS)
|
||
_max_seg = _max_cargo_segment_chars()
|
||
new_s["cargo_description"] = seg if _max_seg <= 0 else seg[:_max_seg]
|
||
out.append(new_s)
|
||
|
||
return out
|
||
|
||
def _merge_shipments_same_destination(
|
||
self, shipments: List[Dict], context_text: str = ""
|
||
) -> List[Dict]:
|
||
"""
|
||
Объединяет перевозки с одинаковым адресом выгрузки в одну запись.
|
||
Поля объединяются консервативно. Если по «смысловым якорям» (бренд, HS, тип, нужные документы)
|
||
видны различающиеся партии/заказа, shipments разделяются на несколько записей.
|
||
Для тендеров с несколькими адресами забора и одной выгрузкой — мягче порог склейки.
|
||
"""
|
||
if not isinstance(shipments, list) or len(shipments) <= 1:
|
||
return shipments
|
||
|
||
tender_relaxed = _is_tender_multi_origin_context(context_text or "")
|
||
merge_threshold = 4 if tender_relaxed else 1
|
||
|
||
# 1) Сначала группируем по месту выгрузки.
|
||
buckets: Dict[str, List[Dict]] = {}
|
||
passthrough: List[Dict] = []
|
||
for s in shipments:
|
||
if not isinstance(s, dict):
|
||
continue
|
||
dest = (s.get("delivery_address") or "").strip()
|
||
dest_key = self._normalize_text_key(dest)
|
||
if not dest_key:
|
||
passthrough.append(s)
|
||
continue
|
||
buckets.setdefault(dest_key, []).append(s)
|
||
|
||
# 2) Объединяем все перевозки в группе с одним и тем же нормализованным адресом выгрузки
|
||
# (две и больше записей). Сначала строим кластеры внутри группы по delivery_address.
|
||
# Правило: если есть явные конфликты по бренду/HS/типу/документам — не сливаем.
|
||
|
||
def _norm_info_str(v: Any) -> str:
|
||
if not isinstance(v, str):
|
||
return ""
|
||
s = v.strip()
|
||
if not s:
|
||
return ""
|
||
low = s.lower()
|
||
# Пустые/служебные значения, которые нельзя считать «доказательством».
|
||
if low in {"не указано", "информация отсутствует", "нет", "unknown", "n/a", "na", "none"}:
|
||
return ""
|
||
return s
|
||
|
||
def _bool_or_none(v: Any) -> Optional[bool]:
|
||
if v is True:
|
||
return True
|
||
if v is False:
|
||
return False
|
||
return None
|
||
|
||
def _any_true_dg(dg: Any) -> bool:
|
||
if not isinstance(dg, dict):
|
||
return False
|
||
for k in ("batteries", "gases", "liquids", "dry_ice"):
|
||
if dg.get(k) is True:
|
||
return True
|
||
return False
|
||
|
||
def _conflict_score(a: Dict, b: Dict) -> int:
|
||
score = 0
|
||
|
||
a_brand = _norm_info_str(a.get("brand_name"))
|
||
b_brand = _norm_info_str(b.get("brand_name"))
|
||
if a_brand and b_brand and a_brand != b_brand:
|
||
score += 2
|
||
|
||
a_hs = _norm_info_str(a.get("hs_code"))
|
||
b_hs = _norm_info_str(b.get("hs_code"))
|
||
if a_hs and b_hs and a_hs != b_hs:
|
||
score += 2
|
||
|
||
# Тип перевозки (если он распознан у обоих)
|
||
a_st = _norm_info_str(a.get("shipping_type"))
|
||
b_st = _norm_info_str(b.get("shipping_type"))
|
||
if a_st and b_st and a_st != b_st:
|
||
score += 1
|
||
|
||
# Описание груза (часто соответствует разным позициям/партиям)
|
||
a_cd = _norm_info_str(a.get("cargo_description"))
|
||
b_cd = _norm_info_str(b.get("cargo_description"))
|
||
if a_cd and b_cd and a_cd != b_cd:
|
||
score += 2
|
||
if tender_relaxed and (
|
||
("стм" in a_cd.lower() and "стм" in b_cd.lower())
|
||
or (a_cd.lower() in b_cd.lower() or b_cd.lower() in a_cd.lower())
|
||
):
|
||
score -= 1
|
||
|
||
# Готовность к отгрузке может отличаться для разных партий
|
||
a_rd = _norm_info_str(a.get("cargo_ready_date"))
|
||
b_rd = _norm_info_str(b.get("cargo_ready_date"))
|
||
if a_rd and b_rd and a_rd != b_rd:
|
||
if not tender_relaxed:
|
||
score += 1
|
||
|
||
# Ценность (если отличаемая в письме)
|
||
a_val = _norm_info_str(a.get("cargo_value"))
|
||
b_val = _norm_info_str(b.get("cargo_value"))
|
||
if a_val and b_val and a_val != b_val:
|
||
score += 1
|
||
if _strong_invoice_value_mismatch(
|
||
str(a.get("cargo_value") or ""), str(b.get("cargo_value") or "")
|
||
):
|
||
score += 4
|
||
|
||
# Документы/требования: конфликт если одно явно True, другое явно False
|
||
for key in ("msds_required", "dgm_report_required", "brand_authorization_letter"):
|
||
av = _bool_or_none(a.get(key))
|
||
bv = _bool_or_none(b.get(key))
|
||
if av is not None and bv is not None and av != bv:
|
||
score += 2
|
||
|
||
# Опасный груз: конфликт если у одного явно опасность категорий, у другого нет
|
||
a_dg = _any_true_dg(a.get("dangerous_goods"))
|
||
b_dg = _any_true_dg(b.get("dangerous_goods"))
|
||
if a_dg != b_dg:
|
||
score += 1
|
||
|
||
return score
|
||
|
||
merged: List[Dict] = []
|
||
for dest_key, group in buckets.items():
|
||
if len(group) < 2:
|
||
passthrough.extend(group)
|
||
continue
|
||
|
||
clusters: List[Dict] = []
|
||
for s in group:
|
||
if not isinstance(s, dict):
|
||
continue
|
||
placed = False
|
||
for base in clusters:
|
||
# Порог: обычно строгий; для тендера с N точками забора — выше (разные даты — норма).
|
||
if _conflict_score(base, s) <= merge_threshold:
|
||
self._merge_shipment_into(base, s)
|
||
placed = True
|
||
break
|
||
if not placed:
|
||
clusters.append(dict(s))
|
||
|
||
merged.extend(clusters)
|
||
|
||
merged = merged + passthrough
|
||
logger.info(
|
||
"Merged shipments by destination (clustered): %s -> %s (tender_relaxed=%s, thr=%s)",
|
||
len(shipments),
|
||
len(merged),
|
||
tender_relaxed,
|
||
merge_threshold,
|
||
)
|
||
return merged
|
||
|
||
def _extract_container_mentions(self, text: str) -> List[str]:
|
||
if not isinstance(text, str) or not text.strip():
|
||
return []
|
||
|
||
t = text.lower().replace("'", "").replace('"', "")
|
||
t = t.replace("ф", "f").replace("х", "x")
|
||
|
||
results: List[str] = []
|
||
|
||
def _normalize_type(size: str, kind: str) -> str:
|
||
k = (kind or "").lower().strip()
|
||
kind_map = {
|
||
"hc": "HC", "hq": "HC",
|
||
"dc": "DC", "gp": "DC",
|
||
"rf": "RF", "rh": "RF", "reefer": "RF", "ref": "RF", "реф": "RF", "рефр": "RF",
|
||
"ot": "OT",
|
||
"fr": "FR",
|
||
}
|
||
k_norm = kind_map.get(k, "")
|
||
return f"{size}{k_norm}" if k_norm else size
|
||
|
||
# Варианты с количеством: 2*40HC, 3x20DC, 2 40hc
|
||
p_count = re.compile(r"(?<!\d)(\d{1,2})\s*(?:\*|x)?\s*(20|40|45)\s*(dc|hc|hq|gp|rf|rh|reefer|ref|реф|рефр|ot|fr)\b", re.IGNORECASE)
|
||
for m in p_count.finditer(t):
|
||
cnt = m.group(1)
|
||
size = m.group(2)
|
||
kind = m.group(3)
|
||
results.append(f"{cnt}x{_normalize_type(size, kind)}")
|
||
|
||
# Одиночные: 40HC, 20DC, 45OT
|
||
p_single = re.compile(r"\b(20|40|45)\s*(dc|hc|hq|gp|rf|rh|reefer|ref|реф|рефр|ot|fr)\b", re.IGNORECASE)
|
||
for m in p_single.finditer(t):
|
||
size = m.group(1)
|
||
kind = m.group(2)
|
||
token = _normalize_type(size, kind)
|
||
if token not in results:
|
||
results.append(token)
|
||
|
||
# Явные контейнеры без типа: "контейнер 40ф"
|
||
p_foot = re.compile(r"\bконтейнер\w*\s*(20|40|45)\s*f?\b", re.IGNORECASE)
|
||
for m in p_foot.finditer(t):
|
||
token = m.group(1)
|
||
if token not in results:
|
||
results.append(token)
|
||
|
||
return list(dict.fromkeys(results))
|
||
|
||
def _infer_container_type_from_sources(self, shipment: Dict, sources: List[Dict]) -> Optional[str]:
|
||
if not isinstance(shipment, dict):
|
||
return None
|
||
email_ids = set(shipment.get("ID_emails") or [])
|
||
if not email_ids:
|
||
return None
|
||
|
||
text_chunks: List[str] = []
|
||
for src in sources or []:
|
||
if src.get("id") not in email_ids:
|
||
continue
|
||
for key in ["subject", "content"]:
|
||
val = src.get(key)
|
||
if isinstance(val, str) and val.strip():
|
||
text_chunks.append(val)
|
||
for att in src.get("attachments", []) or []:
|
||
att_text = att.get("text")
|
||
if isinstance(att_text, str) and att_text.strip():
|
||
text_chunks.append(att_text)
|
||
|
||
if not text_chunks:
|
||
return None
|
||
|
||
mentions = self._extract_container_mentions("\n".join(text_chunks))
|
||
return ", ".join(mentions) if mentions else None
|
||
|
||
def _collect_shipment_source_text(
|
||
self,
|
||
shipment: Dict,
|
||
sources: List[Dict],
|
||
*,
|
||
include_email: bool = True,
|
||
include_attachments: bool = True,
|
||
) -> str:
|
||
"""Текст по перевозке (по ID_emails) с управлением источниками."""
|
||
if not isinstance(shipment, dict):
|
||
return ""
|
||
email_ids = set(shipment.get("ID_emails") or [])
|
||
parts: List[str] = []
|
||
for src in sources or []:
|
||
if src.get("id") not in email_ids:
|
||
continue
|
||
if include_email:
|
||
for key in ("subject", "content"):
|
||
val = src.get(key)
|
||
if isinstance(val, str) and val.strip():
|
||
parts.append(val)
|
||
if include_attachments:
|
||
for att in src.get("attachments", []) or []:
|
||
att_text = att.get("text")
|
||
if isinstance(att_text, str) and att_text.strip():
|
||
parts.append(att_text)
|
||
return "\n".join(parts)
|
||
|
||
@staticmethod
|
||
def _text_mentions_china_context(text: str) -> bool:
|
||
if not isinstance(text, str) or not text.strip():
|
||
return False
|
||
t = text.lower()
|
||
markers = (
|
||
"китай", "china", "кнр", "chinese", "mainland china", "материковый китай",
|
||
"shenzhen", "shanghai", "guangzhou", "ningbo", "yiwu", "qingdao", "xiamen",
|
||
"foshan", "tianjin", "beijing", "hong kong", "гонконг", "guangdong",
|
||
"zhejiang", "jiangsu", "fujian", "义乌", "深圳", "广州", "宁波",
|
||
)
|
||
return any(m in t for m in markers)
|
||
|
||
def _infer_brand_and_authorization_from_sources(self, shipment: Dict, sources: List[Dict]) -> None:
|
||
"""Достаёт название бренда из текста писем и выставляет необходимость авторизационного письма."""
|
||
if not isinstance(shipment, dict):
|
||
return
|
||
email_blob = self._collect_shipment_source_text(
|
||
shipment, sources, include_email=True, include_attachments=False
|
||
)
|
||
att_blob = self._collect_shipment_source_text(
|
||
shipment, sources, include_email=False, include_attachments=True
|
||
)
|
||
blob = email_blob if email_blob.strip() else att_blob
|
||
if not isinstance(blob, str) or not blob.strip():
|
||
return
|
||
low = blob.lower()
|
||
|
||
bn = shipment.get("brand_name")
|
||
if not (isinstance(bn, str) and bn.strip()):
|
||
brand_patterns = [
|
||
r"(?:бренд|brand|trademark|\bтм\b)\s*[::]\s*([^\n\r,;]{1,2000})",
|
||
r"торгов(?:ая|ой)\s+марка\s*[::]\s*([^\n\r,;]{1,2000})",
|
||
r"марк[аи]\s+товара\s*[::]\s*([^\n\r,;]{1,2000})",
|
||
r"(?:производител[ья]|vendor|manufacturer|oem)\s*[::]\s*([^\n\r,;]{1,2000})",
|
||
r"logo\s*[::]\s*([^\n\r,;]{1,2000})",
|
||
]
|
||
for pat in brand_patterns:
|
||
m = re.search(pat, blob, flags=re.IGNORECASE)
|
||
if not m:
|
||
continue
|
||
raw_name = m.group(1).strip()
|
||
raw_name = raw_name.strip('«»"\'•· \t').strip()
|
||
raw_name = re.split(r"\s{2,}|\t", raw_name)[0].strip()
|
||
if not raw_name or len(raw_name) > 4000:
|
||
continue
|
||
bad = {"n/a", "na", "none", "нет", "no", "tbd", "—", "-", "same as above"}
|
||
if raw_name.lower() in bad:
|
||
continue
|
||
shipment["brand_name"] = raw_name[:4000]
|
||
break
|
||
|
||
auth_patterns = (
|
||
r"авторизационн\w*\s+письм",
|
||
r"authorization\s+letter",
|
||
r"authorisation\s+letter",
|
||
r"brand\s+authori[sz]ation",
|
||
r"loa\b|\bletter\s+of\s+authorization\b",
|
||
r"письм\w*\s+(?:от\s+)?бренд",
|
||
r"разрешени\w*\s+(?:от\s+)?бренд",
|
||
r"бренд\w*\s+в\s+таможенн",
|
||
r"таможенн\w*\s+систем\w*\s+китая",
|
||
r"китайск\w*\s+таможен",
|
||
r"регистрац\w*\s+бренд",
|
||
r"trademark\s+registration",
|
||
r"\b1688\b|1688\.com|tmall|taobao",
|
||
)
|
||
if any(re.search(p, low) for p in auth_patterns):
|
||
shipment["brand_authorization_letter"] = True
|
||
|
||
loc_blob = " ".join(
|
||
str(x)
|
||
for x in (
|
||
shipment.get("pickup_address"),
|
||
shipment.get("loading_port"),
|
||
shipment.get("delivery_address"),
|
||
shipment.get("cargo_description"),
|
||
blob,
|
||
)
|
||
if x
|
||
)
|
||
if self._text_mentions_china_context(loc_blob):
|
||
b = shipment.get("brand_name")
|
||
if isinstance(b, str) and b.strip():
|
||
shipment["brand_authorization_letter"] = True
|
||
|
||
# Текстовая сводка по пункту авторизационного письма (для вывода в отчёте/шаблонах).
|
||
lines_email = [ln.strip() for ln in email_blob.splitlines() if isinstance(ln, str) and ln.strip()]
|
||
lines_att = [ln.strip() for ln in att_blob.splitlines() if isinstance(ln, str) and ln.strip()]
|
||
auth_lines: List[str] = []
|
||
auth_need_re = re.compile(
|
||
r"авторизационн\w*\s+письм|authorization\s+letter|authorisation\s+letter|"
|
||
r"letter\s+of\s+authorization|\bloa\b|регистрац\w*\s+бренд|trademark\s+registration|"
|
||
r"бренд\w*\s+в\s+таможенн",
|
||
re.IGNORECASE,
|
||
)
|
||
auth_not_needed_re = re.compile(
|
||
r"авторизационн\w*\s+письм\w*\s+не\s+(?:нужн|треб)|"
|
||
r"authorization\s+letter\s+(?:is\s+)?not\s+required|no\s+authorization\s+letter",
|
||
re.IGNORECASE,
|
||
)
|
||
for ln in lines_email:
|
||
if auth_need_re.search(ln):
|
||
auth_lines.append(re.sub(r"\s+", " ", ln))
|
||
# Если есть явное "не требуется", это тоже важно показать.
|
||
for ln in lines_email:
|
||
if auth_not_needed_re.search(ln):
|
||
clean = re.sub(r"\s+", " ", ln)
|
||
if clean not in auth_lines:
|
||
auth_lines.append(clean)
|
||
# Только если в письме ничего не найдено — добираем релевантные строки из вложений.
|
||
if not auth_lines:
|
||
for ln in lines_att:
|
||
if auth_need_re.search(ln) or auth_not_needed_re.search(ln):
|
||
auth_lines.append("[вложение] " + re.sub(r"\s+", " ", ln))
|
||
|
||
docs = shipment.get("documents_found", {}) or {}
|
||
files = docs.get("brand_authorization") or []
|
||
if isinstance(files, list) and files:
|
||
fnames = [str(x.get("filename") or "").strip() for x in files if isinstance(x, dict)]
|
||
fnames = [f for f in fnames if f]
|
||
if fnames:
|
||
auth_lines.append("Найдены вложения: " + ", ".join(dict.fromkeys(fnames)))
|
||
|
||
if auth_lines:
|
||
shipment["brand_authorization_info"] = " | ".join(dict.fromkeys(auth_lines))[:3000]
|
||
else:
|
||
shipment["brand_authorization_info"] = "Информация отсутствует"
|
||
|
||
def _infer_document_replacement_from_sources(self, shipment: Dict, sources: List[Dict]) -> None:
|
||
"""Замена документов: true/false только при явной формулировке в переписке; иначе null."""
|
||
if not isinstance(shipment, dict):
|
||
return
|
||
if shipment.get("document_replacement_needed") is not None:
|
||
return
|
||
blob = self._collect_shipment_source_text(shipment, sources)
|
||
if not isinstance(blob, str) or not blob.strip():
|
||
return
|
||
low = blob.lower()
|
||
if not re.search(
|
||
r"замен\w*\s+документ|переоформл\w*\s+документ|документ\w*\s+к\s+замене|"
|
||
r"document\s+replacement|replace\s+(?:the\s+)?documents|substitut\w*\s+documents",
|
||
low,
|
||
re.IGNORECASE,
|
||
):
|
||
return
|
||
if re.search(
|
||
r"замен\w*\s+не\s+нуж|замен\w*\s+не\s+треб|не\s+нужна\s+замен|"
|
||
r"no\s+document\s+replacement|replacement\s+not\s+required",
|
||
low,
|
||
re.IGNORECASE,
|
||
):
|
||
shipment["document_replacement_needed"] = False
|
||
return
|
||
shipment["document_replacement_needed"] = True
|
||
|
||
@staticmethod
|
||
def _text_describes_parallel_sea_rail_alternatives(text: str) -> bool:
|
||
"""
|
||
Ж/д и море упомянуты как разные варианты маршрута (один ИЛИ другой), а не как одна
|
||
интермодальная цепочка «море + ж/д» в одной поставке.
|
||
Пример: «Рассматривают 2 варианта: прямые рейсы ЖД / прямые суда через ДВ…».
|
||
"""
|
||
if not isinstance(text, str) or not text.strip():
|
||
return False
|
||
t = text.lower().replace("ё", "е")
|
||
# Явно одна мультимодальная цепочка — не отключаем.
|
||
if re.search(
|
||
r"море\s*\+\s*жд|море\s+и\s+ж/д|интермодал|intermodal|"
|
||
r"sea\s*\+\s*rail|sea\s+and\s+rail|"
|
||
r"морск\w*\s+этап.{0,120}ж\/?д\s+этап|ж\/?д\s+этап.{0,120}морск",
|
||
t,
|
||
re.IGNORECASE | re.DOTALL,
|
||
):
|
||
return False
|
||
if re.search(
|
||
r"рассматрива\w*\s+(?:\d+\s+)?варианта|(?:^|[\n•·●◦\-\–—])\s*(?:два|2)\s+варианта",
|
||
t,
|
||
re.IGNORECASE,
|
||
):
|
||
return True
|
||
# «выход поезда/суда» у морской схемы через ДВ — срок одного из плеч, не продукт «море+жд».
|
||
if re.search(r"поезд[аы]?\s*/\s*судна", t) and re.search(
|
||
r"через\s+дв|дальн\w*\s*восток|\bдв\b",
|
||
t,
|
||
re.IGNORECASE,
|
||
):
|
||
return True
|
||
# Две строки-альтернативы: прямой ж/д и отдельно морские суда.
|
||
if re.search(
|
||
r"прям\w+\s+рейс\w*.{0,100}(?:ж/д|\bжд\b)",
|
||
t,
|
||
re.IGNORECASE,
|
||
) and re.search(r"прям\w+\s+суд", t, re.IGNORECASE):
|
||
return True
|
||
return False
|
||
|
||
@staticmethod
|
||
def _text_implies_sea_and_rail_multimodal(text: str) -> bool:
|
||
"""В одной заявке явно и морской, и ж/д этап (не «только море»)."""
|
||
if not isinstance(text, str) or not text.strip():
|
||
return False
|
||
t = text.lower()
|
||
if re.search(
|
||
r"море\s*\+\s*жд|море\s+и\s+ж/д|море\s*[-–]\s*жд|ж/д\s*\+\s*море|море\+жд|"
|
||
r"sea\s*\+\s*rail|sea\s+and\s+rail|sea\s*[-–]\s*rail|"
|
||
r"мультимодал|intermodal|морск\w*\s+и\s+железнодор|порт\w*\s+.*\s+станц",
|
||
t,
|
||
re.IGNORECASE,
|
||
):
|
||
return True
|
||
if re.search(
|
||
r"только\s+море|только\s+морск|sea\s+only|без\s+ж/д|без\s+жд|no\s+rail",
|
||
t,
|
||
re.IGNORECASE,
|
||
):
|
||
return False
|
||
if RAGEngineGemini._text_describes_parallel_sea_rail_alternatives(text):
|
||
return False
|
||
sea = bool(
|
||
re.search(
|
||
r"\bморск|\bморе\b|sea\s+freight|ocean\s+freight|морской\s+порт|"
|
||
r"порт\s+погруз|vessel|судно|суда|судов|морским\s+пут",
|
||
t,
|
||
re.IGNORECASE,
|
||
)
|
||
)
|
||
rail = bool(
|
||
re.search(
|
||
r"ж/д|железнодор|railway|\brail\b|\bжд\b|"
|
||
r"станци|платформ|1520|вагон|container\s+train|по\s+жд|на\s+жд|"
|
||
r"rail\s+transport|жд\s+этап|\sжд\s|[,;]\s*жд\b",
|
||
t,
|
||
re.IGNORECASE,
|
||
)
|
||
)
|
||
return bool(sea and rail)
|
||
|
||
def _correct_multimodal_sea_rail_for_shipment(self, shipment: Dict, combined_text: str) -> None:
|
||
"""Если выбрана чистая морская перевозка, а в тексте явно море+ж/д — тип «Мультимодальная …»."""
|
||
if not isinstance(shipment, dict):
|
||
return
|
||
if not self._text_implies_sea_and_rail_multimodal(combined_text):
|
||
return
|
||
sea_only = frozenset({"Морская перевозка (FCL)", "Морская перевозка (LCL)"})
|
||
|
||
def _pure_sea(nm: str) -> bool:
|
||
return (nm or "").strip() in sea_only
|
||
|
||
st_t = (shipment.get("shipment_type") or "").strip().upper()
|
||
|
||
def _target_multimodal(from_name: str) -> Optional[str]:
|
||
if not _pure_sea(from_name):
|
||
return None
|
||
fcl = "FCL" in from_name
|
||
if st_t == "LCL":
|
||
fcl = False
|
||
elif st_t == "FCL":
|
||
fcl = True
|
||
cand = (
|
||
"Мультимодальная перевозка море + ж/д (FCL)"
|
||
if fcl
|
||
else "Мультимодальная перевозка море + ж/д (LCL)"
|
||
)
|
||
return cand if self._shipping_type_record_by_name(cand) else None
|
||
|
||
raw_req = shipment.get("requested_shipping_type_names")
|
||
if isinstance(raw_req, list) and raw_req:
|
||
new_req: List[str] = []
|
||
changed = False
|
||
for x in raw_req:
|
||
if not isinstance(x, str):
|
||
new_req.append(x)
|
||
continue
|
||
n = x.strip()
|
||
t = _target_multimodal(n)
|
||
if t:
|
||
new_req.append(t)
|
||
changed = True
|
||
else:
|
||
new_req.append(n)
|
||
if changed:
|
||
shipment["requested_shipping_type_names"] = new_req
|
||
return
|
||
|
||
st = (shipment.get("shipping_type") or "").strip()
|
||
t = _target_multimodal(st)
|
||
if t:
|
||
shipment["shipping_type"] = t
|
||
|
||
def _infer_dangerous_goods_from_sources(self, shipment: Dict, sources: List[Dict]) -> None:
|
||
"""Дополняет dangerous_goods по тексту писем/вложений (категории батареи/газы/жидкости/сухой лёд)."""
|
||
if not isinstance(shipment, dict):
|
||
return
|
||
email_blob = self._collect_shipment_source_text(
|
||
shipment, sources, include_email=True, include_attachments=False
|
||
)
|
||
att_blob = self._collect_shipment_source_text(
|
||
shipment, sources, include_email=False, include_attachments=True
|
||
)
|
||
blob = email_blob if email_blob.strip() else att_blob
|
||
if not isinstance(blob, str) or not blob.strip():
|
||
return
|
||
low_email = email_blob.lower() if isinstance(email_blob, str) else ""
|
||
low_att = att_blob.lower() if isinstance(att_blob, str) else ""
|
||
|
||
dg = shipment.get("dangerous_goods")
|
||
if not isinstance(dg, dict):
|
||
dg = {}
|
||
for key in ("batteries", "gases", "liquids", "dry_ice"):
|
||
if key not in dg:
|
||
dg[key] = None
|
||
shipment["dangerous_goods"] = dg
|
||
|
||
def _hit(pattern: str) -> bool:
|
||
# Приоритет письма: сначала ищем в тексте письма, потом во вложениях.
|
||
if low_email and re.search(pattern, low_email, re.IGNORECASE):
|
||
return True
|
||
if low_att and re.search(pattern, low_att, re.IGNORECASE):
|
||
return True
|
||
return False
|
||
|
||
if dg.get("batteries") is None:
|
||
if _hit(
|
||
r"литий|lithium|li\s*[- ]?ion|liion|батаре|аккумулятор|battery|batteries|"
|
||
r"\bcells?\b|\bbutton\s+cell\b|\bpower\s*bank\b|"
|
||
r"\bun\s*348[01]\b|\bun\s*3090\b|\bun\s*3091\b|\bun\s*3171\b",
|
||
):
|
||
dg["batteries"] = True
|
||
|
||
if dg.get("gases") is None:
|
||
if _hit(
|
||
r"газовый\s+баллон|сжиженн\w*\s+газ|gas\s+cylinder|аэрозол|aerosol|"
|
||
r"compressed\s+gas|баллон\s+с\s+газ|\blpg\b|\blng\b|"
|
||
r"\bco2\b|углекисл\w+\s+газ|пропан|бутан|class\s*2\b|класс\s*2\b|"
|
||
r"\bun\s*1950\b|\bun\s*1013\b",
|
||
):
|
||
dg["gases"] = True
|
||
|
||
if dg.get("liquids") is None:
|
||
if _hit(
|
||
r"легковоспламен|воспламен|flammable|combustible\s+liquid|жидкост.*опасн|"
|
||
r"class\s*3\b|класс\s*3\s*опасност|имдг|imdg|"
|
||
r"\bun\s*12\d{2}\b|\bun\s*19\d{2}\b|\bун\s*\d{4}\b|\bun\s*\d{4}\b|"
|
||
r"solvent|растворител|paint|краск|resin|смола|ink|чернил|ethanol|спирт|acetone|ацетон",
|
||
):
|
||
dg["liquids"] = True
|
||
|
||
if dg.get("dry_ice") is None:
|
||
if _hit(r"сухой\s+л[её]д|dry\s+ice|\bun\s*1845\b"):
|
||
dg["dry_ice"] = True
|
||
|
||
# Явное "не опасный" / non-DG — проставляем False по пустым категориям.
|
||
def _negation_scan(pattern_text: str, keywords: dict) -> None:
|
||
"""Проверяет текст на наличие 'не содержит...' и выставляет False для совпавших категорий."""
|
||
neg_match = re.search(r'не\s+содержит\s+(?P<items>[^\.;]+)', pattern_text, re.IGNORECASE)
|
||
if not neg_match:
|
||
return
|
||
items = neg_match.group('items').lower()
|
||
for cat, word_list in keywords.items():
|
||
if any(w in items for w in word_list):
|
||
if dg.get(cat) is None:
|
||
dg[cat] = False
|
||
|
||
dangerous_keywords = {
|
||
"batteries": ["батаре", "аккумулятор", "элемент питания", "batter"],
|
||
"gases": ["газ", "аэрозол", "баллон", "gas", "aerosol"],
|
||
"liquids": ["жидкост", "liquid", "растворител", "краск", "смол", "масл", "чернил"],
|
||
"dry_ice": ["сухого льда", "сухой лёд", "dry ice"]
|
||
}
|
||
|
||
_negation_scan(low_email, dangerous_keywords)
|
||
_negation_scan(low_att, dangerous_keywords)
|
||
|
||
explicit_non_dg = bool(
|
||
(low_email and re.search(
|
||
r"\bnon[\s\-]?dg\b|not\s+dangerous|not\s+hazardous|"
|
||
r"not\s+classified\s+as\s+dangerous|"
|
||
r"без\s+опасн\w+\s+груз|"
|
||
r"не\s+опасн\w+\s+груз|"
|
||
r"груз\s+не\s*опасн\w*|"
|
||
r"груз\s+не\s+явля\w+\s+опасн\w*|"
|
||
r"не\s+явля\w+\s+опасн\w*|"
|
||
r"неопасн\w+\s+груз|"
|
||
r"опасн\w+\s+груз\w*\s+нет|"
|
||
r"не\s+hazmat",
|
||
low_email,
|
||
re.IGNORECASE,
|
||
)) or
|
||
(low_att and re.search(
|
||
r"\bnon[\s\-]?dg\b|not\s+dangerous|not\s+hazardous|"
|
||
r"not\s+classified\s+as\s+dangerous|"
|
||
r"без\s+опасн\w+\s+груз|"
|
||
r"не\s+опасн\w+\s+груз|"
|
||
r"груз\s+не\s*опасн\w*|"
|
||
r"груз\s+не\s+явля\w+\s+опасн\w*|"
|
||
r"не\s+явля\w+\s+опасн\w*|"
|
||
r"неопасн\w+\s+груз|"
|
||
r"опасн\w+\s+груз\w*\s+нет|"
|
||
r"не\s+hazmat",
|
||
low_att,
|
||
re.IGNORECASE,
|
||
))
|
||
)
|
||
if explicit_non_dg:
|
||
for k in ("batteries", "gases", "liquids", "dry_ice"):
|
||
dg[k] = False
|
||
shipment.pop("dangerous_goods_note", None)
|
||
# Явное "не опасный груз" приоритетнее: MSDS и DGM отмечаем как не нужные.
|
||
shipment["msds_required"] = False
|
||
shipment["dgm_report_required"] = False
|
||
|
||
any_true = any(dg.get(k) is True for k in ("batteries", "gases", "liquids", "dry_ice"))
|
||
if any_true:
|
||
# Не затираем уже существующий note, но и не навязываем.
|
||
return
|
||
# Обобщенный индикатор опасности, если категория не распознана.
|
||
generic_dg = bool(
|
||
(low_email and re.search(
|
||
r"\bdg\b|dangerous\s+goods|hazardous\s+(?:goods|material)|hazmat|imdg|"
|
||
r"опасн\w+\s+груз|класс\s+опасност|class\s+[1-9]\b|"
|
||
r"\bun\s*\d{4}\b|ун\s*\d{4}",
|
||
low_email,
|
||
re.IGNORECASE,
|
||
)) or
|
||
(low_att and re.search(
|
||
r"\bdg\b|dangerous\s+goods|hazardous\s+(?:goods|material)|hazmat|imdg|"
|
||
r"опасн\w+\s+груз|класс\s+опасност|class\s+[1-9]\b|"
|
||
r"\bun\s*\d{4}\b|ун\s*\d{4}",
|
||
low_att,
|
||
re.IGNORECASE,
|
||
))
|
||
)
|
||
if generic_dg:
|
||
# По требованию UI: в 9-м пункте показываем только факт Да/Нет/Нет информации.
|
||
# Если есть лишь общий намёк на DG без конкретной категории, не подставляем текстовый note.
|
||
shipment.pop("dangerous_goods_note", None)
|
||
|
||
def _extract_quantities_from_text(self, raw_text: str) -> Dict[str, Any]:
|
||
"""
|
||
Детерминированное извлечение тройки «ctn/ctns + kg + cbm».
|
||
Это нужно, чтобы не брать первое попавшееся значение (иногда в письме несколько частей).
|
||
"""
|
||
if not isinstance(raw_text, str) or not raw_text.strip():
|
||
return {}
|
||
|
||
def _to_float(v: str) -> Optional[float]:
|
||
if not isinstance(v, str):
|
||
return None
|
||
s = v.strip().replace(" ", "")
|
||
s = s.replace(",", ".")
|
||
try:
|
||
return float(s)
|
||
except Exception:
|
||
return None
|
||
|
||
# Триплет в одном месте строки: "... 57ctns ... 673kgs ... 5.07cbm"
|
||
triplet_re = re.compile(
|
||
r"(?P<ctn>\d+(?:[.,]\d+)?)\s*(?:ctns?|carton(?:s)?|boxes?)\b"
|
||
r".{0,80}?"
|
||
r"(?P<kg>\d+(?:[.,]\d+)?)\s*(?:kgs?|kg|кг)\b"
|
||
r".{0,80}?"
|
||
r"(?P<cbm>\d+(?:[.,]\d+)?)\s*(?:cbm|m3|m³|м3|м³)\b",
|
||
re.IGNORECASE | re.DOTALL,
|
||
)
|
||
|
||
triplets: List[tuple[float, float, float]] = []
|
||
for m in triplet_re.finditer(raw_text):
|
||
ctn = _to_float(m.group("ctn"))
|
||
kg = _to_float(m.group("kg"))
|
||
cbm = _to_float(m.group("cbm"))
|
||
if ctn is None or kg is None or cbm is None:
|
||
continue
|
||
triplets.append((ctn, kg, cbm))
|
||
|
||
# Если триплетов мало/нет — пробуем раздельные совпадения.
|
||
ctn_re = re.compile(
|
||
r"(?<!\d)(?P<ctn>\d+(?:[.,]\d+)?)\s*(?:ctns?|carton(?:s)?|boxes?)\b",
|
||
re.IGNORECASE,
|
||
)
|
||
kg_re = re.compile(
|
||
r"(?<!\d)(?P<kg>\d+(?:[.,]\d+)?)\s*(?:kgs?|kg|кг)\b",
|
||
re.IGNORECASE,
|
||
)
|
||
cbm_re = re.compile(
|
||
r"(?<!\d)(?P<cbm>\d+(?:[.,]\d+)?)\s*(?:cbm|m3|m³|м3|м³)\b",
|
||
re.IGNORECASE,
|
||
)
|
||
|
||
ctn_matches = [_to_float(m.group("ctn")) for m in ctn_re.finditer(raw_text)]
|
||
kg_matches = [_to_float(m.group("kg")) for m in kg_re.finditer(raw_text)]
|
||
cbm_matches = [_to_float(m.group("cbm")) for m in cbm_re.finditer(raw_text)]
|
||
|
||
ctn_matches = [x for x in ctn_matches if x is not None]
|
||
kg_matches = [x for x in kg_matches if x is not None]
|
||
cbm_matches = [x for x in cbm_matches if x is not None]
|
||
|
||
def _sum_or_none(vals: List[float]) -> Optional[float]:
|
||
if not vals:
|
||
return None
|
||
return float(sum(vals))
|
||
|
||
triplet_sum = None
|
||
if triplets:
|
||
triplet_sum = {
|
||
"package_count": _sum_or_none([t[0] for t in triplets]),
|
||
"total_weight_kg": _sum_or_none([t[1] for t in triplets]),
|
||
"total_volume_cbm": _sum_or_none([t[2] for t in triplets]),
|
||
"triplet_count": len(triplets),
|
||
}
|
||
|
||
separate_sum = {
|
||
"package_count": _sum_or_none(ctn_matches),
|
||
"total_weight_kg": _sum_or_none(kg_matches),
|
||
"total_volume_cbm": _sum_or_none(cbm_matches),
|
||
"ctn_count": len(ctn_matches),
|
||
"kg_count": len(kg_matches),
|
||
"cbm_count": len(cbm_matches),
|
||
}
|
||
|
||
return {
|
||
"triplet_sum": triplet_sum,
|
||
"separate_sum": separate_sum,
|
||
}
|
||
|
||
def _shipment_weight_hints(self, shipment: Dict) -> List[str]:
|
||
"""Ключевые токены для привязки строк веса к конкретной перевозке."""
|
||
if not isinstance(shipment, dict):
|
||
return []
|
||
blob = " ".join(
|
||
str(x or "")
|
||
for x in (
|
||
shipment.get("cargo_description"),
|
||
shipment.get("pickup_address"),
|
||
shipment.get("delivery_address"),
|
||
shipment.get("container_type"),
|
||
shipment.get("hs_code"),
|
||
shipment.get("cargo_value"),
|
||
)
|
||
)
|
||
out: List[str] = []
|
||
# Явные коды партий/заказов.
|
||
for pat in (
|
||
r"\btrocyps[-_\s]?\d+\b",
|
||
r"\b[A-Za-z]{2,}[-_]\d{2,}\b",
|
||
r"\b[A-Za-z0-9]{3,}[-_][A-Za-z0-9]{2,}\b",
|
||
):
|
||
for m in re.findall(pat, blob, flags=re.IGNORECASE):
|
||
tok = str(m).strip().lower()
|
||
if tok and tok not in out:
|
||
out.append(tok)
|
||
# Общие слова тоже берём, но ограниченно.
|
||
stop = {
|
||
"cargo", "weight", "gross", "net", "total", "kg", "kgs", "cbm", "ctn",
|
||
"груз", "вес", "брутто", "нетто", "итого", "всего", "адрес", "доставка",
|
||
}
|
||
for w in re.findall(r"[a-zA-Zа-яА-ЯёЁ0-9]{4,}", blob.lower()):
|
||
if w in stop:
|
||
continue
|
||
if w not in out:
|
||
out.append(w)
|
||
if len(out) >= 24:
|
||
break
|
||
return out
|
||
|
||
def _extract_preferred_weight_kg(self, raw_text: str, shipment: Dict) -> Dict[str, Any]:
|
||
"""
|
||
Извлекает вес с приоритетом:
|
||
1) gross/brutto
|
||
2) нейтральный вес (без указания net/gross)
|
||
3) net
|
||
И старается оставить только строки, относящиеся к текущей перевозке.
|
||
"""
|
||
if not isinstance(raw_text, str) or not raw_text.strip():
|
||
return {"value": None, "count": 0, "priority": "none", "context_matched": False}
|
||
|
||
hints = self._shipment_weight_hints(shipment)
|
||
lines = [ln for ln in raw_text.splitlines() if isinstance(ln, str) and ln.strip()]
|
||
if not lines:
|
||
return {"value": None, "count": 0, "priority": "none", "context_matched": False}
|
||
|
||
num_unit_re = re.compile(
|
||
r"(?P<num>\d+(?:[.,]\d+)?)\s*(?P<unit>kgs?|kg|кг|тонн(?:а|ы)?|тонн|tons?|ton|t)\b",
|
||
re.IGNORECASE,
|
||
)
|
||
gross_re = re.compile(r"\bgross\b|\bg\.?\s*w\.?\b|брутто|brutto", re.IGNORECASE)
|
||
net_re = re.compile(r"\bnet\b|\bn\.?\s*w\.?\b|нетто", re.IGNORECASE)
|
||
total_re = re.compile(r"\btotal\b|итог|всего|grand\s*total|итого", re.IGNORECASE)
|
||
|
||
candidates: List[Dict[str, Any]] = []
|
||
for ln in lines:
|
||
ll = ln.lower()
|
||
if any(x in ll for x in ["контейнер", "фут", "20ft", "40ft", "40hc", "20dc"]):
|
||
continue
|
||
kind = "unknown"
|
||
has_gross = bool(gross_re.search(ll))
|
||
has_net = bool(net_re.search(ll))
|
||
if has_gross and not has_net:
|
||
kind = "gross"
|
||
elif has_net and not has_gross:
|
||
kind = "net"
|
||
|
||
ctx_score = 0
|
||
if hints:
|
||
for h in hints:
|
||
if h and h in ll:
|
||
ctx_score += 1
|
||
|
||
line_has_total = bool(total_re.search(ll))
|
||
for m in num_unit_re.finditer(ln):
|
||
raw_num = (m.group("num") or "").replace(" ", "").replace(",", ".")
|
||
try:
|
||
v = float(raw_num)
|
||
except Exception:
|
||
continue
|
||
unit = (m.group("unit") or "").lower()
|
||
if unit in ("ton", "tons", "t", "тонна", "тонны", "тонн"):
|
||
v *= 1000.0
|
||
# Отсекаем нереалистичные веса.
|
||
if v <= 0 or v > 5_000_000:
|
||
continue
|
||
candidates.append(
|
||
{
|
||
"value": round(v, 6),
|
||
"kind": kind,
|
||
"ctx": ctx_score,
|
||
"is_total": line_has_total,
|
||
"line": re.sub(r"\s+", " ", ln.strip())[:280],
|
||
}
|
||
)
|
||
|
||
if not candidates:
|
||
return {"value": None, "count": 0, "priority": "none", "context_matched": False}
|
||
|
||
# Если есть строки, привязанные к контексту текущей перевозки, берём только их.
|
||
context_matched = False
|
||
if hints:
|
||
max_ctx = max(int(c["ctx"]) for c in candidates)
|
||
if max_ctx > 0:
|
||
candidates = [c for c in candidates if int(c["ctx"]) == max_ctx]
|
||
context_matched = True
|
||
|
||
# Приоритет типа веса: gross > unknown > net.
|
||
for bucket_name in ("gross", "unknown", "net"):
|
||
bucket = [c for c in candidates if c["kind"] == bucket_name]
|
||
if not bucket:
|
||
continue
|
||
|
||
# Строки с явным словом "итого/total"
|
||
with_total = [c for c in bucket if c["is_total"]]
|
||
|
||
# Для группы "unknown" без явных total: суммируем все веса,
|
||
# так как они, скорее всего, относятся к отдельным товарным позициям.
|
||
# Контекстные подсказки игнорируем, чтобы не потерять часть позиций.
|
||
if bucket_name == "unknown" and not with_total:
|
||
chosen = bucket # все подходящие строки
|
||
context_matched = False # сигнал, что контекст не привязан к одной партии
|
||
else:
|
||
chosen = with_total if with_total else bucket
|
||
|
||
# Оставляем только записи, где упоминается "kg" или "кг" (если такие есть)
|
||
kg_vals = [c for c in chosen if "kg" in c["line"].lower() or "кг" in c["line"].lower()]
|
||
if kg_vals:
|
||
chosen = kg_vals
|
||
|
||
vals: List[float] = []
|
||
seen: set[tuple[float, str]] = set()
|
||
for c in chosen:
|
||
key = (round(float(c["value"]), 3), str(c["line"]).lower())
|
||
if key in seen:
|
||
continue
|
||
seen.add(key)
|
||
vals.append(float(c["value"]))
|
||
|
||
if not vals:
|
||
continue
|
||
return {
|
||
"value": float(sum(vals)),
|
||
"count": len(vals),
|
||
"priority": bucket_name,
|
||
"context_matched": context_matched,
|
||
}
|
||
def _line_looks_vehicle_size_context(self, line: str) -> bool:
|
||
"""Строка про габариты транспорта/машины — не смешивать с габаритами груза."""
|
||
ll = line.lower()
|
||
keys = (
|
||
"габарит машин",
|
||
"габариты машин",
|
||
"габарит тс",
|
||
"габариты тс",
|
||
"габарит транспорт",
|
||
"габариты транспорт",
|
||
"размер фуры",
|
||
"длина фуры",
|
||
"длина тента",
|
||
"габарит кузов",
|
||
"габариты кузов",
|
||
"габарит прицеп",
|
||
"габариты прицеп",
|
||
"габарит полуприцеп",
|
||
"vehicle dimension",
|
||
"truck dimension",
|
||
"dimensions of truck",
|
||
"размер автопоезд",
|
||
"высота тс",
|
||
"длина тс",
|
||
"ширина тс",
|
||
)
|
||
return any(k in ll for k in keys)
|
||
|
||
def _extract_dimensions_from_text(self, raw_text: str) -> List[Dict[str, float]]:
    """
    Extract every unique L x W x H triple of cargo dimensions from the text.

    Lines that look like carton/size lines are preferred; when none exist,
    all lines are scanned. Vehicle-size lines are skipped. The unit is taken
    from a suffix right after the third number, otherwise from a mm/cm hint
    elsewhere on the line, and the conversion to centimetres is delegated to
    `infer_cargo_triple_raw_to_cm`.

    Returns a list of {"length_cm", "width_cm", "height_cm"} dicts, limited
    to triples whose sides all fall within 1..200 cm.
    """
    if not isinstance(raw_text, str) or not raw_text.strip():
        return []

    def _to_float(v: str) -> Optional[float]:
        if not isinstance(v, str):
            return None
        s = v.strip().replace(" ", "").replace(",", ".")
        try:
            return float(s)
        except ValueError:
            return None

    # Three numbers joined by x/×/X with an optional unit after the third.
    # A lone "m"/"м" is accepted only when no letter or digit follows it, so
    # "max", "m3" or "метров" are not mistaken for metres. Before this, the
    # unit group could never capture metres and the "m" mapping below was
    # unreachable.
    triple_re = re.compile(
        r"(?P<a>\d+(?:[.,]\d+)?)\s*(?:x|×|X)\s*(?P<b>\d+(?:[.,]\d+)?)\s*(?:x|×|X)\s*(?P<c>\d+(?:[.,]\d+)?)"
        r"\s*(?P<u>мм|mm|см|cm|(?:м|m)(?![a-zа-яё0-9]))?(?!\d)",
        re.IGNORECASE,
    )
    # NOTE(review): assumes infer_cargo_triple_raw_to_cm accepts
    # explicit_unit="m"; the original code already had a branch passing it.
    unit_aliases = {"мм": "mm", "mm": "mm", "см": "cm", "cm": "cm", "м": "m", "m": "m"}

    lines = raw_text.splitlines()
    carton_markers = ("carton size", "carton", "габарит", "размер", "size", "size/ctn")
    candidates = [ln for ln in lines if any(k in ln.lower() for k in carton_markers)]
    if not candidates:
        candidates = lines

    dims: List[Dict[str, float]] = []
    seen = set()

    for ln in candidates:
        if self._line_looks_vehicle_size_context(ln):
            continue
        line_unit = _line_hint_dimension_unit(ln)
        for m in triple_re.finditer(ln):
            a = _to_float(m.group("a"))
            b = _to_float(m.group("b"))
            c = _to_float(m.group("c"))
            if a is None or b is None or c is None:
                continue

            # A unit attached to the triple beats a unit hinted elsewhere on the line.
            u_raw = (m.group("u") or "").strip().lower()
            explicit = unit_aliases.get(u_raw) or line_unit

            l_cm, w_cm, h_cm = infer_cargo_triple_raw_to_cm(a, b, c, explicit_unit=explicit)

            # Plausible range for a single package, in centimetres.
            if not (1 <= l_cm <= 200 and 1 <= w_cm <= 200 and 1 <= h_cm <= 200):
                continue

            l_cm = round(l_cm, 4)
            w_cm = round(w_cm, 4)
            h_cm = round(h_cm, 4)
            # Triples equal to within 1 mm count as the same size.
            key = (round(l_cm, 1), round(w_cm, 1), round(h_cm, 1))
            if key in seen:
                continue
            seen.add(key)
            dims.append({"length_cm": l_cm, "width_cm": w_cm, "height_cm": h_cm})

    return dims
|
||
|
||
def _extract_vehicle_dimensions_from_text(self, raw_text: str) -> List[Dict[str, float]]:
|
||
"""Тройки размеров только из строк с контекстом ТС (фура, кузов, машина)."""
|
||
if not isinstance(raw_text, str) or not raw_text.strip():
|
||
return []
|
||
|
||
def _to_float(v: str) -> Optional[float]:
|
||
if not isinstance(v, str):
|
||
return None
|
||
s = v.strip().replace(" ", "").replace(",", ".")
|
||
try:
|
||
return float(s)
|
||
except Exception:
|
||
return None
|
||
|
||
triple_re = re.compile(
|
||
r"(?P<a>\d+(?:[.,]\d+)?)\s*(?:x|×|X)\s*(?P<b>\d+(?:[.,]\d+)?)\s*(?:x|×|X)\s*(?P<c>\d+(?:[.,]\d+)?)(?!\d)",
|
||
re.IGNORECASE,
|
||
)
|
||
lines = raw_text.splitlines()
|
||
out: List[Dict[str, float]] = []
|
||
seen: set[tuple[float, float, float]] = set()
|
||
|
||
for ln in lines:
|
||
if not self._line_looks_vehicle_size_context(ln):
|
||
continue
|
||
ll = ln.lower()
|
||
meters_hint = bool(re.search(r"\bм\b|meter|metre|\bm\s", ll))
|
||
for m in triple_re.finditer(ln):
|
||
a = _to_float(m.group("a"))
|
||
b = _to_float(m.group("b"))
|
||
c = _to_float(m.group("c"))
|
||
if a is None or b is None or c is None:
|
||
continue
|
||
vals = [a, b, c]
|
||
if meters_hint or (max(vals) <= 30 and min(vals) >= 0.25):
|
||
vals = [v * 100.0 for v in vals]
|
||
else:
|
||
unit_mm = max(vals) > 250
|
||
if unit_mm:
|
||
vals = [v / 10.0 for v in vals]
|
||
l_cm, w_cm, h_cm = vals[0], vals[1], vals[2]
|
||
if not (50 <= l_cm <= 3000 and 50 <= w_cm <= 400 and 50 <= h_cm <= 500):
|
||
continue
|
||
l_cm = round(l_cm, 4)
|
||
w_cm = round(w_cm, 4)
|
||
h_cm = round(h_cm, 4)
|
||
key = (round(l_cm, 1), round(w_cm, 1), round(h_cm, 1))
|
||
if key in seen:
|
||
continue
|
||
seen.add(key)
|
||
out.append({"length_cm": l_cm, "width_cm": w_cm, "height_cm": h_cm})
|
||
return out
|
||
|
||
def _postprocess_quantities_and_dimensions(
|
||
self,
|
||
shipment: Dict,
|
||
sources: List[Dict],
|
||
) -> None:
|
||
"""
|
||
Детерминированное усиление:
|
||
- пересчитать package_count/weight/volume по множественным упоминаниям (не брать первое)
|
||
- расширить dimensions всеми вариантами размеров
|
||
- применить эвристики отрицаний для булевых полей
|
||
"""
|
||
if not isinstance(shipment, dict):
|
||
return
|
||
|
||
raw_text = self._collect_shipment_source_text(shipment, sources)
|
||
if not raw_text.strip():
|
||
return
|
||
|
||
quantities = self._extract_quantities_from_text(raw_text)
|
||
triplet_sum = quantities.get("triplet_sum") or None
|
||
separate_sum = quantities.get("separate_sum") or {}
|
||
preferred_weight = self._extract_preferred_weight_kg(raw_text, shipment)
|
||
|
||
total_shipments = getattr(self, "_postprocess_total_shipments", None)
|
||
single_shipment_mode = isinstance(total_shipments, int) and total_shipments == 1
|
||
|
||
# Применить эвристики отрицаний для булевых полей
|
||
def _apply_boolean_negations(shipment: Dict, email_text: str):
|
||
"""Если в тексте явно указано отрицание для булева поля, проставляем False."""
|
||
if not email_text or not isinstance(email_text, str):
|
||
return
|
||
low = email_text.lower()
|
||
|
||
# Явный non-DG: фиксируем все категории как "нет", а MSDS/DGM как "не нужен".
|
||
if re.search(
|
||
r"\bnon[\s\-]?dg\b|not\s+dangerous|not\s+hazardous|not\s+classified\s+as\s+dangerous|"
|
||
r"без\s+опасн\w+\s+груз|не\s+опасн\w+\s+груз|груз\s+не\s*опасн\w*|"
|
||
r"груз\s+не\s+явля\w+\s+опасн\w*|не\s+явля\w+\s+опасн\w*|неопасн\w+\s+груз|"
|
||
r"опасн\w+\s+груз\w*\s+нет|не\s+hazmat",
|
||
low,
|
||
re.IGNORECASE,
|
||
):
|
||
dg = shipment.get("dangerous_goods")
|
||
if not isinstance(dg, dict):
|
||
dg = {}
|
||
for k in ("batteries", "gases", "liquids", "dry_ice"):
|
||
dg[k] = False
|
||
shipment["dangerous_goods"] = dg
|
||
shipment["msds_required"] = False
|
||
shipment["dgm_report_required"] = False
|
||
shipment.pop("dangerous_goods_note", None)
|
||
|
||
# MSDS
|
||
if shipment.get("msds_required") is None:
|
||
if re.search(r"msds\s+не\s+(нуж|треб)|не\s+(нуж|треб)\w*\s+msds|паспорт\s+безопасн\w*\s+не\s+(нуж|треб)", low):
|
||
shipment["msds_required"] = False
|
||
# DGM
|
||
if shipment.get("dgm_report_required") is None:
|
||
if re.search(r"dgm\s+не\s+(нуж|треб)|не\s+(нуж|треб)\w*\s+dgm|декларац\w+\s+на\s+опасн\w*\s+не\s+(нуж|треб)", low):
|
||
shipment["dgm_report_required"] = False
|
||
# Замена документов (уже есть отдельный метод, но можно дополнить)
|
||
if shipment.get("document_replacement_needed") is None:
|
||
if re.search(r"замен\w*\s+документ\w*\s+не\s+(нуж|треб)|не\s+(нуж|треб)\w*\s+замен\w+\s+документ", low):
|
||
shipment["document_replacement_needed"] = False
|
||
# Авторизационное письмо бренда
|
||
if shipment.get("brand_authorization_letter") is None:
|
||
if re.search(r"авторизацион\w+\s+письм\w*\s+не\s+(нуж|треб)|не\s+(нуж|треб)\w*\s+авторизацион\w+\s+письм", low):
|
||
shipment["brand_authorization_letter"] = False
|
||
|
||
_apply_boolean_negations(shipment, raw_text)
|
||
|
||
# Далее идёт оригинальный код: определения _parse_existing_num, _maybe_int
|
||
|
||
def _parse_existing_num(v: Any) -> Optional[float]:
|
||
if v is None:
|
||
return None
|
||
if isinstance(v, (int, float)):
|
||
return float(v)
|
||
if isinstance(v, str):
|
||
s = v.strip().replace(" ", "").replace(",", ".")
|
||
try:
|
||
return float(s)
|
||
except Exception:
|
||
return None
|
||
return None
|
||
|
||
def _maybe_int(v: Any) -> Any:
|
||
if v is None:
|
||
return None
|
||
if isinstance(v, (int, float)):
|
||
vf = float(v)
|
||
if abs(vf - round(vf)) < 1e-6:
|
||
return int(round(vf))
|
||
return vf
|
||
return v
|
||
|
||
# 1) Если найдено >=2 триплетов — считаем, что есть несколько явных частей.
|
||
if isinstance(triplet_sum, dict) and triplet_sum.get("triplet_count", 0) >= 2:
|
||
shipment["package_count"] = _maybe_int(triplet_sum.get("package_count"))
|
||
# Вес агрессивно пересчитываем только при одной перевозке.
|
||
if single_shipment_mode:
|
||
if preferred_weight.get("value") is not None:
|
||
shipment["total_weight_kg"] = float(preferred_weight.get("value"))
|
||
else:
|
||
shipment["total_weight_kg"] = triplet_sum.get("total_weight_kg")
|
||
shipment["total_volume_cbm"] = triplet_sum.get("total_volume_cbm")
|
||
else:
|
||
# 2) Иначе — точечно обновляем только если по отдельным полям есть много совпадений,
|
||
# а текущее значение меньше вычисленного.
|
||
for field, key_count in [
|
||
("package_count", "ctn_count"),
|
||
("total_volume_cbm", "cbm_count"),
|
||
]:
|
||
computed = separate_sum.get(field)
|
||
match_count = separate_sum.get(key_count, 0) if isinstance(separate_sum, dict) else 0
|
||
if computed is None or match_count < 2:
|
||
continue
|
||
|
||
existing = _parse_existing_num(shipment.get(field))
|
||
if existing is None or (existing is not None and existing < computed * 0.99):
|
||
shipment[field] = _maybe_int(computed) if field == "package_count" else computed
|
||
|
||
# Вес обновляем отдельно ТОЛЬКО при одной перевозке:
|
||
# при нескольких партиях в треде веса часто уже разнесены по строкам/таблицам.
|
||
if single_shipment_mode:
|
||
# 2а) Сумма отдельных kg-совпадений (если их много) как fallback.
|
||
kg_sum = separate_sum.get("total_weight_kg") if isinstance(separate_sum, dict) else None
|
||
kg_count = int((separate_sum or {}).get("kg_count", 0)) if isinstance(separate_sum, dict) else 0
|
||
if kg_sum is not None and kg_count >= 2:
|
||
existing_w = _parse_existing_num(shipment.get("total_weight_kg"))
|
||
if existing_w is None or (existing_w is not None and existing_w < float(kg_sum) * 0.99):
|
||
shipment["total_weight_kg"] = float(kg_sum)
|
||
|
||
# 2б) Предпочтительный вес (gross/контекст) приоритетнее fallback.
|
||
w = preferred_weight.get("value")
|
||
w_cnt = int(preferred_weight.get("count") or 0)
|
||
if w is not None and w_cnt >= 1:
|
||
existing_w = _parse_existing_num(shipment.get("total_weight_kg"))
|
||
# Не занижаем уже найденный общий вес одиночным "предпочтительным" значением
|
||
# (часто это вес одной палеты/позиции без маркера total).
|
||
should_apply = (
|
||
existing_w is None
|
||
or float(w) >= float(existing_w) * 0.99
|
||
or w_cnt >= 2
|
||
)
|
||
if should_apply and (
|
||
existing_w is None or abs(existing_w - float(w)) > max(1.0, float(w) * 0.01)
|
||
):
|
||
shipment["total_weight_kg"] = float(w)
|
||
else:
|
||
# Для нескольких перевозок обновляем вес только при явной контекстной привязке
|
||
# к конкретной партии (код/токены shipment в строке веса).
|
||
w = preferred_weight.get("value")
|
||
w_cnt = int(preferred_weight.get("count") or 0)
|
||
ctx_ok = bool(preferred_weight.get("context_matched"))
|
||
if ctx_ok and w is not None and w_cnt >= 1:
|
||
existing_w = _parse_existing_num(shipment.get("total_weight_kg"))
|
||
if existing_w is None or abs(existing_w - float(w)) > max(1.0, float(w) * 0.01):
|
||
shipment["total_weight_kg"] = float(w)
|
||
|
||
# 3) Габариты: расширяем dimensions всеми извлеченными вариантами.
|
||
dims = self._extract_dimensions_from_text(raw_text)
|
||
if dims:
|
||
existing_dims = shipment.get("dimensions")
|
||
if not isinstance(existing_dims, list):
|
||
existing_dims = []
|
||
merged = existing_dims + dims
|
||
|
||
# Дедупликация (грубая) перед normalize_shipment.
|
||
seen_key: set[tuple[float, float, float]] = set()
|
||
out: List[Dict[str, float]] = []
|
||
for d in merged:
|
||
if not isinstance(d, dict):
|
||
continue
|
||
l = d.get("length_cm")
|
||
w = d.get("width_cm")
|
||
h = d.get("height_cm")
|
||
if l is None or w is None or h is None:
|
||
# нормализация размеров (и конвертация *_mm) произойдёт в normalize_shipment
|
||
out.append(d)
|
||
continue
|
||
try:
|
||
l_f = float(str(l).replace(" ", "").replace(",", "."))
|
||
w_f = float(str(w).replace(" ", "").replace(",", "."))
|
||
h_f = float(str(h).replace(" ", "").replace(",", "."))
|
||
except Exception:
|
||
out.append(d)
|
||
continue
|
||
key = (round(l_f, 1), round(w_f, 1), round(h_f, 1))
|
||
if key in seen_key:
|
||
continue
|
||
seen_key.add(key)
|
||
out.append({"length_cm": l_f, "width_cm": w_f, "height_cm": h_f})
|
||
|
||
shipment["dimensions"] = out
|
||
|
||
vdims = self._extract_vehicle_dimensions_from_text(raw_text)
|
||
if vdims:
|
||
existing_v = shipment.get("vehicle_dimensions")
|
||
if not isinstance(existing_v, list):
|
||
existing_v = []
|
||
merged_v = existing_v + vdims
|
||
seen_v: set[tuple[float, float, float]] = set()
|
||
out_v: List[Dict[str, float]] = []
|
||
for d in merged_v:
|
||
if not isinstance(d, dict):
|
||
continue
|
||
l = d.get("length_cm")
|
||
w = d.get("width_cm")
|
||
h = d.get("height_cm")
|
||
if l is None or w is None or h is None:
|
||
out_v.append(d)
|
||
continue
|
||
try:
|
||
l_f = float(str(l).replace(" ", "").replace(",", "."))
|
||
w_f = float(str(w).replace(" ", "").replace(",", "."))
|
||
h_f = float(str(h).replace(" ", "").replace(",", "."))
|
||
except Exception:
|
||
out_v.append(d)
|
||
continue
|
||
key = (round(l_f, 1), round(w_f, 1), round(h_f, 1))
|
||
if key in seen_v:
|
||
continue
|
||
seen_v.add(key)
|
||
out_v.append({"length_cm": l_f, "width_cm": w_f, "height_cm": h_f})
|
||
shipment["vehicle_dimensions"] = out_v
|
||
|
||
def _enrich_operator_document_services(self, shipment: Dict, sources: List[Dict]) -> None:
|
||
"""
|
||
Если в письме явно указано, что документ не предоставляет клиент/контрагент,
|
||
добавляем в additional_services строку о подготовке документа силами оператора.
|
||
"""
|
||
if not isinstance(shipment, dict):
|
||
return
|
||
raw_text = self._collect_shipment_source_text(shipment, sources)
|
||
if not raw_text.strip():
|
||
return
|
||
text_l = raw_text.lower()
|
||
|
||
docs = shipment.get("documents_found", {}) or {}
|
||
required_missing = {
|
||
"msds": shipment.get("msds_required") is True and not (docs.get("msds") or []),
|
||
"dgm": shipment.get("dgm_report_required") is True and not (docs.get("dgm") or []),
|
||
"brand_authorization": shipment.get("brand_authorization_letter") is True and not (docs.get("brand_authorization") or []),
|
||
}
|
||
|
||
# Отдельно учитываем формулировки "документ запрошен", это НЕ "не предоставят".
|
||
msds_requested = bool(re.search(
|
||
r"\bmsds\b.{0,24}\b(запрош\w*|requested|request)\b|\b(запрош\w*|requested|request)\b.{0,24}\bmsds\b",
|
||
text_l,
|
||
re.IGNORECASE,
|
||
))
|
||
|
||
actors = r"(клиент|заказчик|покупател\w*|контрагент\w*|поставщик\w*|отправител\w*|грузоотправител\w*)"
|
||
neg_verbs = r"(не\s+(?:предостав\w*|высыла\w*|пришл\w*|прикладыва\w*|имеет\w*|в\s+наличии))"
|
||
doc_words = {
|
||
"msds": r"(?:\bmsds\b|паспорт\w*\s+безопасност\w*|material\s+safety\s+data\s+sheet|safety\s+data\s+sheet)",
|
||
"dgm": r"(?:\bdgm\b|dangerous\s+goods|декларац\w*\s+на\s+опасн|imdg)",
|
||
"brand_authorization": r"(?:авторизацион\w*\s+письм\w*|authorization\s+letter|authorisation\s+letter|письм\w*\s+бренд)",
|
||
}
|
||
|
||
def _doc_not_provided(doc_key: str) -> bool:
|
||
dpat = doc_words[doc_key]
|
||
# "поставщик не предоставляет <документ>" или "<документ> не предоставляет(ся)"
|
||
p1 = rf"{actors}\b.{{0,45}}?{neg_verbs}\b.{{0,45}}?{dpat}"
|
||
p2 = rf"{dpat}.{{0,45}}?{neg_verbs}"
|
||
return bool(re.search(p1, text_l, re.IGNORECASE) or re.search(p2, text_l, re.IGNORECASE))
|
||
|
||
doc_not_provided = {
|
||
"msds": _doc_not_provided("msds"),
|
||
"dgm": _doc_not_provided("dgm"),
|
||
"brand_authorization": _doc_not_provided("brand_authorization"),
|
||
}
|
||
|
||
# Защита от ложного срабатывания: "MSDS запрошен" без отрицания.
|
||
if msds_requested and not doc_not_provided["msds"]:
|
||
required_missing["msds"] = False
|
||
|
||
# Добавляем пункты в additional_services только если одновременно:
|
||
# (1) по данным перевозки этот документ нужен и не приложен, и
|
||
# (2) в тексте писем явно сказано, что КОНКРЕТНО ЭТОТ документ не предоставляет клиент/поставщик.
|
||
# Раньше использовалась общая фраза «документы не предоставляются» — из‑за неё подтягивались
|
||
# и MSDS, и DGM, и письмо бренда разом; отдельно DGM не должен включать строку про MSDS.
|
||
additions: List[str] = []
|
||
if required_missing["msds"] and doc_not_provided["msds"]:
|
||
additions.append(
|
||
"Получение/подготовка MSDS силами оператора (по письму документ не предоставляется клиентом/поставщиком)"
|
||
)
|
||
if required_missing["dgm"] and doc_not_provided["dgm"]:
|
||
additions.append(
|
||
"Подготовка DGM / декларации на опасный груз силами оператора (по письму документ не от клиента/поставщика)"
|
||
)
|
||
if required_missing["brand_authorization"] and doc_not_provided["brand_authorization"]:
|
||
additions.append(
|
||
"Получение авторизационного письма бренда силами оператора (по письму документ не от клиента/поставщика)"
|
||
)
|
||
|
||
if not additions:
|
||
return
|
||
|
||
extras = shipment.get("additional_services")
|
||
if not isinstance(extras, list):
|
||
extras = []
|
||
seen = {str(x).strip().lower() for x in extras if str(x).strip()}
|
||
for line in additions:
|
||
key = line.strip().lower()
|
||
if key not in seen:
|
||
extras.append(line)
|
||
seen.add(key)
|
||
shipment["additional_services"] = extras
|
||
|
||
def get_attachment(self, session_id: str, email_index: int, attachment_index: int) -> Optional[Dict]:
    """
    Return the stored attachment record (original content included).

    Looks up `self.sessions[session_id][email_index]["attachments"][attachment_index]`
    and returns None when the session is unknown or either index is out of
    range (negative indexes are rejected as well).
    """
    if session_id not in self.sessions:
        return None
    session_emails = self.sessions[session_id]
    if not 0 <= email_index < len(session_emails):
        return None
    files = session_emails[email_index].get("attachments", [])
    if not 0 <= attachment_index < len(files):
        return None
    return files[attachment_index]
|
||
|
||
def _extract_email_content(self, email: Dict) -> tuple:
|
||
parts = [
|
||
f"От: {email.get('senderName', '')} ({email.get('sender', '')})",
|
||
f"Тема: {email.get('subject', '')}",
|
||
f"Кому: {email.get('to', '')}",
|
||
f"Дата: {email.get('receivedTime', '')}",
|
||
f"Содержание: {email.get('body', '')}"
|
||
]
|
||
attachments_list = []
|
||
|
||
attachments = email.get('attachments', [])
|
||
if attachments:
|
||
parts.append("Вложения:")
|
||
for att in attachments:
|
||
filename = att.get('filename', 'unknown')
|
||
size = att.get('size', 0)
|
||
content_b64 = att.get('content')
|
||
att_text = ""
|
||
|
||
if content_b64:
|
||
try:
|
||
file_content = base64.b64decode(content_b64)
|
||
extracted_text = self.doc_processor.extract_text(file_content, filename)
|
||
att_text = extracted_text if isinstance(extracted_text, str) else ""
|
||
if att_text.strip():
|
||
parts.append(f"- {filename} ({size} байт):\n{att_text}")
|
||
else:
|
||
parts.append(
|
||
f"- {filename} ({size} байт): (извлекаемого текста нет — например изображение; "
|
||
f"учитывайте имя файла и тело письма)"
|
||
)
|
||
except Exception as e:
|
||
logger.error(f"Failed to extract text from attachment {filename}: {e}")
|
||
parts.append(f"- {filename} ({size} байт) — ошибка извлечения текста")
|
||
else:
|
||
parts.append(f"- {filename} ({size} байт) — содержимое не передано")
|
||
|
||
attachments_list.append(
|
||
{"filename": filename, "text": att_text, "size": size, "content_base64": content_b64}
|
||
)
|
||
|
||
return "\n".join(parts), attachments_list
|
||
|
||
def _two_pass_preflight_enabled(self) -> bool:
|
||
"""Второй проход: RAG_TWO_PASS_LLM=0|false|off отключает."""
|
||
raw = os.getenv("RAG_TWO_PASS_LLM", "1").strip().lower()
|
||
if raw in ("", "1", "true", "yes", "on", "enable", "enabled"):
|
||
return True
|
||
if raw in ("0", "false", "no", "off", "disable", "disabled"):
|
||
return False
|
||
return True
|
||
|
||
def _run_preflight_classification(
|
||
self,
|
||
context_text: str,
|
||
query_text: str,
|
||
type_names: List[str],
|
||
) -> Optional[Dict[str, Any]]:
|
||
"""
|
||
Проход 1: короткая классификация (режим FCL/LCL, модальности, подсказка shipping_type).
|
||
Результат вставляется во второй проход как orientирующая подсказка, не как истина.
|
||
"""
|
||
if not (context_text and str(context_text).strip()) or context_text.strip() == "Нет писем для анализа.":
|
||
return None
|
||
max_ch = int(os.getenv("RAG_PREFLIGHT_MAX_CHARS", "48000"))
|
||
ctx = context_text if len(context_text) <= max_ch else (
|
||
context_text[:max_ch] + f"\n... [обрезано для preflight, RAG_PREFLIGHT_MAX_CHARS={max_ch}]"
|
||
)
|
||
model = os.getenv("RAG_PREFLIGHT_MODEL", "").strip() or "card_generation"
|
||
max_tok = int(os.getenv("RAG_PREFLIGHT_MAX_TOKENS", "1200"))
|
||
|
||
system_prompt = """Ты классификатор логистической переписки. Верни ТОЛЬКО JSON-объект без markdown и лишнего текста.
|
||
|
||
Верни только эти поля:
|
||
- load_mode: FCL | LCL | road_LTL | road_FTL | air | not_container | unknown
|
||
- multimodal_sea_rail: true/false (true только для одной связанной цепочки море+ж/д; альтернативы "море ИЛИ ж/д" = false)
|
||
- shipping_type_suggestion: ровно одно имя из переданного списка допустимых типов или ""
|
||
- shipment_type_if_container: "FCL" | "LCL" | ""
|
||
- indicative_all_modes: true/false (true, если явно просят индикатив по нескольким модальностям сразу)
|
||
|
||
Правила:
|
||
- Не придумывай факты.
|
||
- При сомнении: load_mode=unknown, shipping_type_suggestion="", shipment_type_if_container="", confidence="low".
|
||
- FCL/LCL — только контейнерная морская/жд тема; FTL/LTL — только автодорога.
|
||
- Если multimodal_sea_rail=true, shipping_type_suggestion должна быть мультимодальной (а не чисто морской/жд)."""
|
||
|
||
names_lines = "\n".join(str(n) for n in type_names if n)
|
||
user_content = (
|
||
"Допустимые значения shipping_type_suggestion (ровно одна строка из списка ниже):\n"
|
||
+ names_lines
|
||
+ "\n\n--- ТЕКСТ ПИСЕМ ---\n"
|
||
+ ctx
|
||
+ "\n\n---\nДополнительный запрос аналитика: "
|
||
+ (query_text or "")
|
||
)
|
||
|
||
try:
|
||
response = self.openai_client.chat.completions.create(
|
||
model=model,
|
||
messages=[
|
||
{"role": "system", "content": system_prompt},
|
||
{"role": "user", "content": user_content},
|
||
],
|
||
temperature=0,
|
||
max_tokens=max_tok,
|
||
)
|
||
raw = (response.choices[0].message.content or "").strip()
|
||
parsed = self._parse_json_response(raw)
|
||
if not isinstance(parsed, dict):
|
||
return None
|
||
# нормализация имени типа
|
||
sug = parsed.get("shipping_type_suggestion")
|
||
if isinstance(sug, str) and sug.strip() and type_names:
|
||
if sug.strip() not in type_names:
|
||
# мягкое сопоставление без переопределения списка
|
||
logger.warning(
|
||
"Preflight shipping_type_suggestion %r не из списка — обнуляем",
|
||
sug.strip(),
|
||
)
|
||
parsed["shipping_type_suggestion"] = ""
|
||
return parsed
|
||
except Exception as e:
|
||
logger.warning("Preflight classification failed: %s", e, exc_info=True)
|
||
return None
|
||
|
||
def _select_shipping_types_for_prompt(
    self,
    context_text: str,
    query_text: str,
    preflight: Optional[Dict[str, Any]],
) -> List[Dict]:
    """
    Shrink the shipping-types block that goes into the prompt.

    Each configured type is scored by how many of its keywords occur as
    substrings of the lower-cased email context plus analyst query. The type
    named by ``preflight["shipping_type_suggestion"]`` (if any) is placed
    first, then the best-scoring types are added until K entries are chosen.

    The full list of dict-shaped types is returned unchanged when K <= 0,
    when preflight sets ``indicative_all_modes`` to True, or when no keyword
    matched at all.
    """
    limit = _shipping_types_prompt_k()
    candidates = [entry for entry in self.shipping_types if isinstance(entry, dict)]
    if limit <= 0 or not candidates:
        return candidates
    if preflight and preflight.get("indicative_all_modes") is True:
        logger.info("RAG_SHIPPING_TYPES_PROMPT_K: indicative_all_modes — все типы в промпте")
        return candidates

    haystack = f"{context_text}\n{query_text or ''}".lower()

    def keyword_hits(entry: Dict) -> int:
        # Count keywords (at least 2 chars after trimming) found in the text.
        hits = 0
        for keyword in entry.get("keywords") or []:
            if isinstance(keyword, str):
                needle = keyword.strip().lower()
                if len(needle) >= 2 and needle in haystack:
                    hits += 1
        return hits

    # Best score first; ties are broken alphabetically by type name.
    ranked = sorted(
        ((keyword_hits(entry), str(entry.get("name") or ""), entry) for entry in candidates),
        key=lambda item: (-item[0], item[1]),
    )

    if not ranked or ranked[0][0] == 0:
        logger.info(
            "RAG_SHIPPING_TYPES_PROMPT_K=%d: нет совпадений keywords — все %d типов в промпте",
            limit,
            len(candidates),
        )
        return candidates

    chosen: List[Dict] = []
    taken: set[str] = set()

    suggestion = (preflight or {}).get("shipping_type_suggestion")
    if isinstance(suggestion, str) and suggestion.strip():
        wanted = suggestion.strip()
        suggested = next((item for item in ranked if item[1] == wanted), None)
        if suggested is not None:
            chosen.append(suggested[2])
            taken.add(suggested[1])

    for _, type_name, entry in ranked:
        if len(chosen) >= limit:
            break
        if type_name not in taken:
            chosen.append(entry)
            taken.add(type_name)

    logger.info(
        "RAG_SHIPPING_TYPES_PROMPT_K=%d: в промпте %d из %d типов перевозок",
        limit,
        len(chosen),
        len(candidates),
    )
    return chosen
|
||
|
||
def _collect_session_sources(self, session_id: Optional[str]) -> tuple[str, List[Dict]]:
|
||
"""Собирает context_text и sources для сессии (тот же формат, что и для LLM)."""
|
||
if session_id and session_id in self.sessions:
|
||
emails = self.sessions[session_id]
|
||
logger.info(f"Session {session_id} has {len(emails)} emails")
|
||
email_texts = []
|
||
sources: List[Dict] = []
|
||
for idx, email in enumerate(emails, 1):
|
||
email_id = f"{session_id}_{idx}"
|
||
email_texts.append(f"ID письма: {email_id}\n{email['content']}")
|
||
sources.append({
|
||
"id": email_id,
|
||
**email["metadata"],
|
||
"content": email["content"],
|
||
"attachments": email.get("attachments", [])
|
||
})
|
||
context_text = "\n\n---\n\n".join(email_texts)
|
||
logger.info(f"Context preview: {context_text[:300]}...")
|
||
return context_text, sources
|
||
if session_id:
|
||
logger.warning(f"Session {session_id} not found")
|
||
return "Нет писем для анализа.", []
|
||
|
||
def _session_content_fingerprint(self, session_id: str) -> Optional[str]:
    """
    Fingerprint of the set of emails in a session.

    Combines, per email, ``metadata["email_id"]`` with a SHA-256 of its
    content, and then mixes in the mtime of the shipping-types file, the
    mtimes of this module and the document-processor files next to it, and
    the cache-bust token. Returns ``None`` when the session is unknown.
    """
    if session_id not in self.sessions:
        return None

    def _entry(email: Dict) -> str:
        metadata = email.get("metadata") or {}
        body = email.get("content") or ""
        digest = hashlib.sha256(body.encode("utf-8", errors="replace")).hexdigest()
        return f"{str(metadata.get('email_id') or '')}:{digest}"

    # Sorted so the fingerprint does not depend on the order of the emails.
    entries = sorted(_entry(email) for email in self.sessions[session_id])

    types_path = resolve_shipping_types_path()
    types_mtime = ""
    if types_path and os.path.isfile(types_path):
        types_mtime = str(int(os.path.getmtime(types_path)))

    watched_files = (
        __file__,
        os.path.join(_SHIPPING_DIR, "document_processor.py"),
        os.path.join(_SHIPPING_DIR, "document_processor(1).py"),
    )
    code_tags: List[str] = []
    for candidate in watched_files:
        try:
            if candidate and os.path.isfile(candidate):
                stamp = int(os.path.getmtime(candidate))
                code_tags.append(f"{os.path.basename(candidate)}:{stamp}")
        except Exception:
            # Best effort: a file that cannot be stat'ed is left out.
            continue

    bust = _report_cache_bust_token()
    raw = "".join(
        (
            f"sid:{session_id}\n",
            "\n".join(entries),
            "\nST_mtime:",
            types_mtime,
            "\nCODE:",
            "|".join(code_tags),
            "\nBUST:",
            bust,
        )
    )
    return hashlib.sha256(raw.encode("utf-8")).hexdigest()
|
||
|
||
def _report_cache_file_path(self, session_id: str, query_text: str) -> Optional[str]:
    """Return the cache file path for a (session, query) pair.

    The file name is the SHA-256 of the session fingerprint and the query
    text. Returns ``None`` when the session has no fingerprint.
    """
    fingerprint = self._session_content_fingerprint(session_id)
    if not fingerprint:
        return None
    key_source = "\n".join((fingerprint, query_text or ""))
    cache_key = hashlib.sha256(key_source.encode("utf-8")).hexdigest()
    return os.path.join(_report_cache_dir(), f"{cache_key}.json")
|
||
|
||
def _try_load_cached_report(self, session_id: Optional[str], query_text: str) -> Optional[Dict]:
    """Return a previously cached report for this session and query, if any.

    Returns ``None`` when caching is disabled, no session id is given, the
    cache file is missing or unreadable, or its payload is not a version-1
    JSON object. On a hit the ``sources`` are rebuilt from the live session
    (they are not stored in the cache file) and the result is passed through
    ``_normalize_api_report_payload``.
    """
    if not session_id or not _report_cache_enabled():
        return None
    path = self._report_cache_file_path(session_id, query_text)
    if not path or not os.path.isfile(path):
        return None
    try:
        with open(path, "r", encoding="utf-8") as f:
            payload = json.load(f)
    except Exception as e:
        logger.warning("Cargo report cache read failed: %s", e)
        return None
    # A cache file may hold valid JSON that is not an object (e.g. a list or
    # null); treat it as a miss instead of failing on ``payload.get``.
    if not isinstance(payload, dict) or payload.get("version") != 1:
        return None
    _, sources = self._collect_session_sources(session_id)
    logger.info(
        "Cargo report cache HIT (session=%s, emails=%d)",
        session_id,
        len(sources),
    )
    return self._normalize_api_report_payload({
        "answer": payload.get("answer") or "",
        "structured_data": payload.get("structured_data") or {},
        "sources": sources,
        "total_emails_analyzed": len(sources),
    })
|
||
|
||
def _save_cached_report(self, session_id: Optional[str], query_text: str, result: Dict) -> None:
    """Persist the answer and structured data of a report to the cache file.

    Does nothing when caching is disabled, no session id is given, or no
    cache path can be derived. Write failures are logged and swallowed; the
    cache is best effort.
    """
    if not session_id or not _report_cache_enabled():
        return
    path = self._report_cache_file_path(session_id, query_text)
    if not path:
        return
    payload = {
        "version": 1,
        "answer": result.get("answer"),
        "structured_data": result.get("structured_data"),
        "total_emails_analyzed": result.get("total_emails_analyzed"),
    }
    try:
        # ``dirname`` is "" for a bare file name and ``os.makedirs("")``
        # raises; fall back to the current directory as
        # ``_write_learning_rows`` does.
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False)
        logger.info("Cargo report cache saved: %s", path)
    except Exception as e:
        logger.warning("Cargo report cache write failed: %s", e)
|
||
|
||
def _normalize_api_report_payload(self, result: Optional[Dict]) -> Dict:
|
||
"""Единый формат ответа для UI: structured_data.shipments — всегда список словарей."""
|
||
if not isinstance(result, dict):
|
||
return {
|
||
"answer": "",
|
||
"structured_data": {"shipments": []},
|
||
"sources": [],
|
||
"total_emails_analyzed": 0,
|
||
}
|
||
sd = result.get("structured_data")
|
||
if not isinstance(sd, dict):
|
||
result["structured_data"] = self._ensure_valid_structure({})
|
||
else:
|
||
result["structured_data"] = self._ensure_valid_structure(sd)
|
||
sd2 = result.get("structured_data")
|
||
if (
|
||
isinstance(sd2, dict)
|
||
and isinstance(sd2.get("shipments"), list)
|
||
and len(sd2["shipments"]) > 0
|
||
):
|
||
sd2.pop("parse_error", None)
|
||
sd2.pop("raw_preview", None)
|
||
if not isinstance(result.get("sources"), list):
|
||
result["sources"] = []
|
||
te = result.get("total_emails_analyzed")
|
||
if not isinstance(te, int):
|
||
try:
|
||
result["total_emails_analyzed"] = int(te) if te is not None else len(result["sources"])
|
||
except (TypeError, ValueError):
|
||
result["total_emails_analyzed"] = len(result["sources"])
|
||
return result
|
||
|
||
def _read_learning_rows(self) -> List[Dict[str, Any]]:
    """Load the stored learning examples.

    Returns the dict items of the ``examples`` list from the learning store
    file, or an empty list when the file is missing, unreadable, or not
    shaped as ``{"examples": [...]}``.
    """
    store_path = _learning_store_path()
    if not os.path.isfile(store_path):
        return []
    try:
        with open(store_path, "r", encoding="utf-8") as fh:
            document = json.load(fh)
    except Exception as e:
        logger.warning("Cargo learning store read failed: %s", e)
        return []
    if not isinstance(document, dict):
        return []
    examples = document.get("examples")
    if not isinstance(examples, list):
        return []
    return [example for example in examples if isinstance(example, dict)]
|
||
|
||
def _write_learning_rows(self, rows: List[Dict[str, Any]]) -> None:
    """Overwrite the learning store with ``rows`` as a version-1 document.

    The parent directory is created if needed. Failures are logged and
    swallowed.
    """
    store_path = _learning_store_path()
    document = {"version": 1, "examples": rows}
    try:
        parent_dir = os.path.dirname(store_path) or "."
        os.makedirs(parent_dir, exist_ok=True)
        with open(store_path, "w", encoding="utf-8") as fh:
            json.dump(document, fh, ensure_ascii=False, indent=2)
    except Exception as e:
        logger.warning("Cargo learning store write failed: %s", e)
|
||
|
||
def _learning_few_shot_system_fragment(self, context_text: str) -> str:
    """Build the few-shot block of past examples for the system prompt.

    Stored examples are ranked by word overlap between the current context
    and each example's stored context. Examples with a positive overlap are
    taken first; if fewer than N qualify, the most recent examples fill the
    remaining slots. Each chosen example contributes its structured JSON,
    truncated to 3500 characters.

    Rows are read in both layouts: the flat keys ``context_preview`` /
    ``created_at`` / ``structured_data_compact`` and the nested
    ``input.context_text`` / ``meta.created_at`` / ``output.structured_data``
    that ``record_cargo_learning`` writes. Reading only the flat keys made
    every row written in the nested layout unusable.

    Returns:
        The prompt fragment, or ``""`` when few-shot is disabled, N <= 0, or
        no usable example exists.
    """
    if not _learning_few_shot_enabled():
        return ""
    n = _learning_few_shot_count()
    if n <= 0:
        return ""
    rows = self._read_learning_rows()
    if not rows:
        return ""

    def _nested(row: Dict[str, Any], section: str, key: str) -> Any:
        part = row.get(section)
        return part.get(key) if isinstance(part, dict) else None

    def _created_at(row: Dict[str, Any]) -> str:
        return str(row.get("created_at") or _nested(row, "meta", "created_at") or "")

    def _preview(row: Dict[str, Any]) -> str:
        value = row.get("context_preview") or _nested(row, "input", "context_text") or ""
        return value if isinstance(value, str) else ""

    def _example_blob(row: Dict[str, Any]) -> Optional[Dict]:
        blob = row.get("structured_data_compact")
        if not isinstance(blob, dict):
            blob = _nested(row, "output", "structured_data")
        return blob if isinstance(blob, dict) else None

    ctx_words = _context_word_set((context_text or "")[:12000])
    scored: List[tuple] = []
    for r in rows:
        ew = _context_word_set(_preview(r)[:12000])
        score = len(ctx_words & ew) if ctx_words else 0
        scored.append((score, _created_at(r), r))
    scored.sort(key=lambda t: (-t[0], t[1]))
    picked = [t[2] for t in scored[:n] if t[0] > 0]

    if len(picked) < n:
        # Not enough similar examples: top up with the most recent ones.
        recent = sorted(rows, key=_created_at, reverse=True)
        for r in recent:
            if r not in picked:
                picked.append(r)
            if len(picked) >= n:
                break

    blocks: List[str] = []
    cap = 3500
    for ex in picked[:n]:
        blob = _example_blob(ex)
        if blob is None:
            continue
        js = _json_prompt_compact(blob)
        if len(js) > cap:
            js = js[:cap] + "…"
        # Number by emitted blocks so skipped rows leave no gaps.
        blocks.append(
            f"Пример {len(blocks) + 1} (фрагмент эталонного JSON по похожей переписке):\n{js}"
        )
    if not blocks:
        return ""
    return (
        "\n\nОПЫТ ПО ПРОШЛЫМ РАЗБОРАМ (ориентир по формату и полям, не копируй данные если в текущих письмах иначе):\n"
        + "\n\n".join(blocks)
        + "\n\n"
    )
|
||
|
||
def _learning_dedupe_hash(self, context_text: str, structured_data: Dict) -> str:
|
||
sd = structured_data if isinstance(structured_data, dict) else {}
|
||
ship = sd.get("shipments")
|
||
try:
|
||
payload = json.dumps(ship, sort_keys=True, ensure_ascii=False, default=str)
|
||
except Exception:
|
||
payload = str(ship)
|
||
raw = (context_text or "")[:8000] + "\n" + payload
|
||
return hashlib.sha256(raw.encode("utf-8", errors="replace")).hexdigest()
|
||
|
||
def record_cargo_learning(
    self,
    *,
    structured_data: Dict,
    context_preview: Optional[str] = None,
    session_id: Optional[str] = None,
    notes: Optional[str] = None,
) -> bool:
    """
    Store a "correspondence fragment -> structured answer" pair for few-shot use.

    May be called after a manual JSON correction in the UI (pass the
    corrected ``structured_data``).

    The stored row keeps the nested ``input`` / ``output`` / ``meta``
    sections and additionally carries the flat fields read elsewhere in this
    class: ``dedupe_hash`` (checked here before storing), and
    ``created_at`` / ``context_preview`` / ``structured_data_compact`` (read
    by ``_learning_few_shot_system_fragment``). Without ``dedupe_hash`` in the
    row the duplicate check could never match.

    Args:
        structured_data: Extraction result; must be a dict.
        context_preview: Email text the result was derived from. When blank
            and ``session_id`` is known, the session context is used.
        session_id: Session whose emails are stored as the example input.
        notes: Free-form origin marker saved with the row.

    Returns:
        True when a new example was appended; False when the input is not a
        dict, there is neither enough context nor any shipment, or an
        identical example is already stored.
    """
    if not isinstance(structured_data, dict):
        return False
    sd = self._ensure_valid_structure(structured_data.copy())
    ctx = context_preview if isinstance(context_preview, str) else ""
    if not ctx.strip() and session_id and session_id in self.sessions:
        ct, _ = self._collect_session_sources(session_id)
        ctx = ct
    ctx = (ctx or "")[:24000]
    if len(ctx.strip()) < 80 and not sd.get("shipments"):
        logger.info("Cargo learning: skip (no context and empty shipments)")
        return False
    h = self._learning_dedupe_hash(ctx, sd)
    rows = self._read_learning_rows()
    if any(isinstance(r, dict) and r.get("dedupe_hash") == h for r in rows):
        logger.info("Cargo learning: duplicate hash, skip")
        return False
    context_text, sources = self._collect_session_sources(session_id)

    cleaned_emails = _clean_emails_for_learning(sources)
    created_at = datetime.now().isoformat()

    row = {
        # Flat fields consumed by the dedupe check and the few-shot reader.
        "dedupe_hash": h,
        "created_at": created_at,
        # The few-shot reader only looks at the first 12000 characters.
        "context_preview": ctx[:12000],
        "structured_data_compact": {"shipments": sd.get("shipments") or []},
        "notes": notes,
        "input": {
            "emails": cleaned_emails,
            # Without session emails the collected text is only a "no emails"
            # placeholder; keep the context the caller actually supplied.
            "context_text": context_text if sources else ctx
        },
        "output": {
            "structured_data": structured_data,
            "client_letter": structured_data.get("generated_letter")
        },
        "meta": {
            "session_id": session_id,
            "created_at": created_at,
            "quality": "auto"
        }
    }
    rows.append(row)
    max_n = _learning_max_store()
    if len(rows) > max_n:
        # Keep only the newest ``max_n`` examples.
        rows = rows[-max_n:]
    self._write_learning_rows(rows)
    logger.info("Cargo learning: stored example (%d total)", len(rows))
    return True
|
||
|
||
def _maybe_auto_record_learning(
    self, session_id: Optional[str], context_text: str, result: Dict
) -> None:
    """Record the analysis result as a learning example when auto mode is on.

    Results without dict ``structured_data``, or with a parse error and no
    shipments, are skipped. Any failure while recording is logged and
    swallowed.
    """
    if not _learning_auto_enabled():
        return
    structured = result.get("structured_data")
    if not isinstance(structured, dict):
        return
    failed_parse = bool(structured.get("parse_error")) and not structured.get("shipments")
    if failed_parse:
        return
    try:
        self.record_cargo_learning(
            structured_data=structured,
            context_preview=context_text,
            session_id=session_id,
            notes="auto_after_analysis",
        )
    except Exception as e:
        logger.warning("Cargo learning auto-record failed: %s", e)
|
||
|
||
async def query_cargo_info(self, query_text: str, session_id: Optional[str] = None, top_k: int = 10) -> Dict:
    """Analyze the emails of a session and return a structured cargo report.

    Steps: return a cached report if one exists; build the email context
    (optionally de-duplicated and truncated); optionally run the preflight
    classification; pick the shipping types for the prompt; call the LLM;
    parse and validate its JSON; enrich the shipments from the sources; cache
    the report and optionally record it as a learning example.

    Args:
        query_text: Additional analyst request; sent as the user message.
        session_id: Session whose emails are analyzed.
        top_k: Not used by this method.

    Returns:
        A dict with ``answer`` (raw model text), ``structured_data``,
        ``sources`` and ``total_emails_analyzed``, normalized by
        ``_normalize_api_report_payload``.

    Raises:
        Any exception raised by the LLM client call is logged and re-raised.
    """
    # Check the cache first: a hit rebuilds the sources itself, so collecting
    # the session context beforehand would be wasted work.
    cached = self._try_load_cached_report(session_id, query_text)
    if cached is not None:
        return cached

    context_text, sources = self._collect_session_sources(session_id)

    if _email_thread_dedupe_enabled():
        context_text = _dedupe_email_thread_paragraphs(context_text)

    max_ctx = _max_llm_context_chars()
    if max_ctx > 0 and len(context_text) > max_ctx:
        context_text = (
            context_text[:max_ctx]
            + f"... [текст обрезан: RAG_MAX_CONTEXT_CHARS={max_ctx}]"
        )

    # Same dict guard as ``_select_shipping_types_for_prompt``: a malformed
    # config entry must not break the whole request.
    type_names = [
        st.get("name")
        for st in self.shipping_types
        if isinstance(st, dict) and st.get("name")
    ]
    preflight: Optional[Dict[str, Any]] = None
    if self._two_pass_preflight_enabled():
        preflight = self._run_preflight_classification(
            context_text, query_text or "", type_names
        )
        if preflight:
            logger.info("Preflight LLM pass completed (keys=%s)", list(preflight.keys()))

    crit_max = _criteria_preview_chars()
    man_max = _mandatory_counterparty_chars()
    types_for_prompt = self._select_shipping_types_for_prompt(
        context_text, query_text or "", preflight
    )
    shipping_types_info = []
    for st in types_for_prompt:
        shipping_types_info.append({
            "name": st.get("name"),
            "keywords": st.get("keywords", []) or [],
            "criteria_preview": _limit_prompt_field(st.get("criteria") or "", crit_max),
            "mandatory_counterparty_criteria": _limit_prompt_field(
                st.get("mandatory_counterparty_criteria") or "", man_max
            ),
        })
    shipping_types_json_compact = _json_prompt_compact(shipping_types_info)

    preflight_block = ""
    if preflight:
        preflight_block = (
            "\n\nПРЕДВАРИТЕЛЬНЫЙ ПРОХОД 1 — КЛАССИФИКАЦИЯ (отдельный короткий запрос к LLM до полного извлечения; ориентир, "
            "проверь по фактам в письмах ниже):\n"
            + _json_prompt_compact(preflight)
            + "\n\nИспользование: учти load_mode, shipment_type_if_container и shipping_type_suggestion "
            "при заполнении shipment_type и shipping_type. "
            "Если письма или вложения противоречат этому блоку — приоритет у текста писем/вложений. "
            "При confidence=low или пустом shipping_type_suggestion не заполняй shipping_type без явных признаков.\n"
        )

    container_iso_ref_block = iso_reference_prompt_block()
    if container_iso_ref_block:
        container_iso_ref_block = "\n\n" + container_iso_ref_block + "\n"

    learning_fragment = self._learning_few_shot_system_fragment(context_text)

    system_prompt = f"""Ты - эксперт по логистике и грузоперевозкам. Твоя задача - проанализировать предоставленные письма и извлечь ВСЮ доступную информацию о грузоперевозках в строгом JSON-формате.
Твоя задача — максимально точно извлечь данные из писем и вложений.
{learning_fragment}
КЛЮЧЕВЫЕ ПРАВИЛА:
- Не додумывай факты. Если данных нет: null для чисел/неизвестных булевых, "" для строк, [] для списков.
- Приоритет источников: основной текст письма > вложения > подписи/дисклеймеры (игнорировать).
- Несколько shipments делай только при явном разделении на независимые партии/маршруты; иначе один shipment.
- Не выводи рассуждения.

PRE-FLIGHT:
- Блок preflight ниже — только подсказка, не источник истины.
- Если preflight противоречит письмам/вложениям, приоритет у писем/вложений.
- Используй preflight только как ориентир для `shipping_type` и `shipment_type`, без дублирования его вывода.
{preflight_block}

ИСПОЛЬЗУЙ ТОЛЬКО ЭТИ ТИПЫ:
{shipping_types_json_compact}

ВЫБОР ТИПОВ ПЕРЕВОЗКИ:
- `requested_shipping_type_names`: массив точных `name` из списка выше, которые явно запрошены/сравниваются для этой партии.
- Один однозначный способ -> массив из одного имени.
- Если клиент просит все/любой варианты -> перечисли все релевантные имена из списка.
- Если явных признаков способа нет -> `requested_shipping_type_names` = [] и `shipping_type` = "".
- `shipping_type` заполняй только когда в `requested_shipping_type_names` ровно один элемент, иначе "".
- Мультимодальность море+ж/д ставь только для одной связанной цепочки маршрута; варианты "море ИЛИ ж/д" — это не мультимодальность.
- FCL/LCL — контейнерная тема море/жд; FTL/LTL — автодорога.

СОГЛАСОВАНИЕ ПОЛЕЙ:
- `shipment_type`: только "FCL" | "LCL" | ""; при явном (FCL)/(LCL) в `shipping_type` согласуй значение.
- `container_type` — для контейнерной море/жд темы; `vehicle_type` и `vehicle_dimensions` — для автодороги.
- `shipping_options` заполняй только явно упомянутыми вариантами, без выдуманных цен/сроков.
- `document_replacement_needed`: true/false только по явной формулировке, иначе null.
- В `dangerous_goods` ставь true/false только при явном подтверждении/отрицании, иначе null.
- Если в письме явно сказано, что груз НЕ опасный (non-DG / not dangerous / не опасный), проставь `dangerous_goods.batteries/gases/liquids/dry_ice = false`, а также `msds_required = false` и `dgm_report_required = false`.
- Если в письме/вложениях есть габариты палет/коробок/грузовых мест — ОБЯЗАТЕЛЬНО заполни `dimensions` даже если также указаны контейнеры/машины/вес.
{container_iso_ref_block}

ТРЕБУЕМЫЙ ФОРМАТ ОТВЕТА (ТОЛЬКО JSON):
{{
"shipments": [
{{
"ID_emails": ["список ID писем, откуда взята информация"],
"client_name": "название компании-клиента",
"incoterms": "условия поставки Incoterms",
"cargo_ready_date": "дата(ы) готовности груза к отгрузке: одна строка или массив строк; если для разных мест/партий указаны разные даты — перечисли все (позже отобразятся через запятую)",
"pickup_address": "адрес забора груза",
"cargo_value": "стоимость груза с валютой",
"package_count": 10,
"total_weight_kg": 150.5,
"dimensions": [
{{"length_cm": 100, "width_cm": 80, "height_cm": 60, "dimension_unit": "cm"}}
],
"vehicle_dimensions": [
{{"length_cm": 1360, "width_cm": 245, "height_cm": 270}}
],
"total_volume_cbm": 2.5,
"cargo_description": "описание груза",
"hs_code": "код ТН ВЭД",
"dangerous_goods": {{
"batteries": false,
"gases": false,
"liquids": false,
"dry_ice": false
}},
"stackable_with_others": true,
"stackable_among_themselves": true,
"msds_required": false,
"batteries_packed_separately": false,
"dgm_report_required": false,
"brand_name": "название бренда",
"brand_authorization_letter": true,
"document_replacement_needed": null,
"transshipment_with_third_country": false,
"exporter_has_export_license": true,
"additional_services": ["переупаковка", "маркировка"],
"arrival_expediting_responsibility": "получатель",
"delivery_address": "адрес доставки",
"special_transport_requirements": "требования к транспорту (пропуск и т.п., не габариты ТС)",
"shipment_type": "FCL или LCL или \"\" — только при явном указании для морской/ж/д контейнерной перевозки",
"container_type": "количество и типоразмер ISO-контейнеров (как в критерии «Количество и типоразмер контейнеров»), если указано",
"vehicle_type": "тип автотранспорта для автодороги, если указано",
"temperature_range": "температурный режим, если указан",
"customs_clearance_required": true,
"customs_clearance_place_export_rf": "место оформления",
"fumigation_on_wooden_packaging": true,
"requested_shipping_type_names": ["ТОЧНЫЕ name из списка типов: все запрошенные/релевантные для этой партии; один элемент если способ один; все типы из списка если клиент просит любой/все варианты"],
"shipping_type": "одно имя из списка если requested_shipping_type_names длины 1, иначе \"\"",
"shipping_options": [
{{
"mode": "air|sea|road|rail",
"cost": "3500 USD",
"transit_time": "3-5 дней",
"details": "прямой рейс, с таможенным оформлением"
}}
]
}}
]
}}
ПРАВИЛА ЗАПОЛНЕНИЯ:
Если информация отсутствует — используй null для чисел и для булевых полей (не ставь false, если факт неизвестен), пустую строку "" для текста, пустой массив [] для списков. Значение false для булевых — только если в письме явно указано отрицание.
document_replacement_needed — только по явным фразам про замену документов; иначе null.
Обязательно заполни requested_shipping_type_names по правилам блока «ВЫБОР ТИПОВ ПЕРЕВОЗКИ»; shipping_type — дублирование при одном типе, иначе "".
В ID_emails укажи список ID писем, из которых взята информация для этой перевозки
cargo_ready_date — дата готовности груза (ready date, ETD от производителя, «готов к отгрузке с …»): строка или массив строк; не выдумывай. Несколько дат для разных складов/партий — все в массиве или через запятую в одной строке.
dimensions — массив объектов, по одному на каждое грузовое место; поля length_cm, width_cm, height_cm — в сантиметрах. Если в письме явно указаны мм (или числа вида 1200×800×600 без «см»), переведи в см или укажи в объекте dimension_unit: \"cm\" | \"mm\" | \"m\" (мм и м код потом приведёт к см).
Если габариты указаны в packing list/аттаче (carton size / pallet size / dimensions), они также обязательны к переносу в `dimensions`.
vehicle_dimensions — только габариты транспортного средства (авто), не груза; [] если не указаны
shipment_type — "FCL"/"LCL" или "" по правилам выше (не путать с типом перевозки shipping_type)
shipping_options — только варианты, явно упомянутые или описанные в письмах/вложениях (цены, сроки — только если указаны). Если в тексте нет вариантов доставки — [] (пустой массив). Не придумывай стоимость, сроки и маршруты.
Контекст из писем для анализа:
{context_text}
Верни ТОЛЬКО валидный JSON-объект, начинающийся с {{ и заканчивающийся }}. Никакого дополнительного текста."""

    try:
        response = self.openai_client.chat.completions.create(
            model="card_generation",
            #model="anthropic/claude-3.5-sonnet",
            messages=[
                {"role": "system", "content": system_prompt},
                # The signature allows a missing query; never send None as
                # the message content.
                {"role": "user", "content": query_text or ""}
            ],
            temperature=0,
            max_tokens=4000
        )
        answer_text = response.choices[0].message.content
        logger.info(f"Model response received, length: {len(answer_text) if answer_text else 0}")
    except Exception as e:
        logger.error(f"Error calling OpenAI API: {e}", exc_info=True)
        raise

    # Validate the structure BEFORE touching the shipments: the model may
    # return "shipments": null or non-dict items, which would otherwise crash
    # the loop below.
    structured_data = self._ensure_valid_structure(self._parse_json_response(answer_text))
    documents_found = self.detect_documents(sources)
    has_msds_docs = isinstance(documents_found, dict) and bool(documents_found.get("msds"))
    for shipment in structured_data["shipments"]:
        shipment["documents_found"] = documents_found
        # If an MSDS document was actually found, strengthen the tri-state:
        # an explicit ``msds_required = False`` is kept, None/missing
        # becomes True.
        if has_msds_docs and shipment.get("msds_required") is not False:
            shipment["msds_required"] = True

    structured_data["shipments"] = self._merge_shipments_same_destination(
        structured_data.get("shipments", []), context_text
    )
    structured_data["shipments"] = self._collapse_shipments_single_source(
        structured_data.get("shipments", []), sources
    )
    shipments = structured_data.get("shipments", [])
    for s in shipments:
        if not isinstance(s, dict):
            continue
        if not (isinstance(s.get("container_type"), str) and s.get("container_type").strip()):
            inferred_container = self._infer_container_type_from_sources(s, sources)
            if inferred_container:
                s["container_type"] = inferred_container

    for s in shipments:
        if not isinstance(s, dict):
            continue
        self._infer_brand_and_authorization_from_sources(s, sources)
        self._infer_document_replacement_from_sources(s, sources)
        self._infer_dangerous_goods_from_sources(s, sources)
        self._enrich_special_requirements(s, sources)

    # Deterministically reinforce numeric fields and dimensions from the
    # email/attachment text, so that not just the first match is taken and
    # sizes are extracted for all variants.
    total_shipments = len(shipments) if isinstance(shipments, list) else 0
    self._postprocess_total_shipments = total_shipments
    try:
        for s in shipments:
            self._postprocess_quantities_and_dimensions(s, sources)
    finally:
        # Always reset the per-request marker, even if post-processing fails.
        self._postprocess_total_shipments = None

    for s in shipments:
        self.normalize_shipment(s, shipments)
        self._enrich_operator_document_services(s, sources)
    structured_data = self._enrich_with_shipping_types(structured_data, context_text, sources)

    if preflight:
        structured_data["preflight_classification"] = preflight

    out = {
        "answer": answer_text,
        "structured_data": structured_data,
        "sources": sources,
        "total_emails_analyzed": len(sources)
    }
    self._save_cached_report(session_id, query_text, out)
    self._maybe_auto_record_learning(session_id, context_text, out)
    return self._normalize_api_report_payload(out)
|
||
|
||
def _parse_json_response(self, answer_text: str) -> Dict:
    """Extract and parse the first JSON value from a model response.

    A Markdown code fence is unwrapped if present (the first fenced block is
    used). Parsing starts at the first ``{`` or ``[`` and stops at the end of
    that JSON value, so trailing prose after it is ignored. A top-level array
    is wrapped as ``{"shipments": [...]}``.

    The value is decoded with ``json.JSONDecoder.raw_decode`` rather than a
    hand-written bracket scanner, so malformed or truncated JSON is reported
    with the decoder's own error position.

    Returns:
        The parsed dict, or on any failure
        ``{"shipments": [], "parse_error": <message>, "raw_preview": <first 300 chars>}``.
    """
    try:
        text = answer_text.strip() if answer_text else ""
        if not text:
            raise ValueError("Empty response from model")

        if "```" in text:
            fenced = re.findall(r'```(?:json)?\s*(.*?)```', text, re.DOTALL)
            if fenced:
                text = fenced[0].strip()
            else:
                # Unclosed fence: just drop the markers.
                text = text.replace("```json", "").replace("```", "").strip()

        start_idx = next((i for i, char in enumerate(text) if char in '{['), -1)
        if start_idx == -1:
            raise ValueError("No JSON object or array found in response")

        # raw_decode parses exactly one JSON value beginning at start_idx and
        # reports where it ended; anything after it is ignored.
        result, _end_idx = json.JSONDecoder().raw_decode(text, start_idx)
        result = _normalize_dict_keys(result)
        if isinstance(result, list):
            result = {"shipments": result}

        return result

    except json.JSONDecodeError as e:
        logger.warning(f"JSON decode error: {e}")
        logger.debug(f"Failed JSON preview: {answer_text[:500] if answer_text else 'None'}...")
        return {"shipments": [], "parse_error": str(e), "raw_preview": answer_text[:300] if answer_text else ""}
    except Exception as e:
        logger.warning(f"Failed to parse JSON: {e}", exc_info=True)
        return {"shipments": [], "parse_error": str(e), "raw_preview": answer_text[:300] if answer_text else ""}
|
||
|
||
def _ensure_valid_structure(self, data: Any) -> Dict:
|
||
if isinstance(data, str):
|
||
logger.error(f"Expected dict but got string: {data[:100]}")
|
||
return {"shipments": [], "error": "Invalid response format: string instead of object"}
|
||
|
||
if not isinstance(data, dict):
|
||
logger.warning(f"Expected dict but got {type(data)}, creating empty structure")
|
||
return {"shipments": []}
|
||
|
||
if "shipments" not in data:
|
||
data["shipments"] = []
|
||
|
||
sh_raw = data.get("shipments")
|
||
if isinstance(sh_raw, dict):
|
||
vals = list(sh_raw.values())
|
||
if vals and all(isinstance(x, dict) for x in vals):
|
||
data["shipments"] = vals
|
||
else:
|
||
data["shipments"] = []
|
||
elif not isinstance(sh_raw, list):
|
||
data["shipments"] = []
|
||
|
||
data["shipments"] = [s for s in data["shipments"] if isinstance(s, dict)]
|
||
|
||
for shipment in data["shipments"]:
|
||
if "ID_emails" not in shipment:
|
||
shipment["ID_emails"] = []
|
||
if "shipping_options" not in shipment:
|
||
shipment["shipping_options"] = []
|
||
if "dangerous_goods" not in shipment:
|
||
shipment["dangerous_goods"] = {}
|
||
if "dimensions" not in shipment:
|
||
shipment["dimensions"] = []
|
||
if "vehicle_dimensions" not in shipment:
|
||
shipment["vehicle_dimensions"] = []
|
||
if "shipping_type_candidates" not in shipment:
|
||
shipment["shipping_type_candidates"] = []
|
||
if "requested_shipping_type_names" not in shipment:
|
||
shipment["requested_shipping_type_names"] = []
|
||
crd = shipment.get("cargo_ready_date")
|
||
if isinstance(crd, list):
|
||
shipment["cargo_ready_date"] = ", ".join(
|
||
str(x).strip() for x in crd if x is not None and str(x).strip()
|
||
)
|
||
elif crd is not None and not isinstance(crd, str):
|
||
s = str(crd).strip()
|
||
shipment["cargo_ready_date"] = s if s else ""
|
||
|
||
return data
|
||
|
||
def _normalize_shipping_type_name_key(self, name: str) -> str:
|
||
"""Мягкая нормализация имени типа перевозки для устойчивого сопоставления."""
|
||
if not isinstance(name, str):
|
||
return ""
|
||
n = name.strip().lower()
|
||
if not n:
|
||
return ""
|
||
# Приводим похожие символы и удаляем пунктуацию/пробелы.
|
||
n = n.replace("ё", "е")
|
||
n = n.replace(" + ", "+")
|
||
n = re.sub(r"[\"'`]", "", n)
|
||
n = re.sub(r"[\s\-\u2013\u2014_/.,;:()]+", "", n)
|
||
return n
|
||
|
||
def _shipping_type_record_by_name(self, name: str) -> Optional[Dict]:
|
||
n = (name or "").strip()
|
||
if not n:
|
||
return None
|
||
for st in self.shipping_types:
|
||
if isinstance(st, dict) and st.get("name") == n:
|
||
return st
|
||
key = self._normalize_shipping_type_name_key(n)
|
||
if not key:
|
||
return None
|
||
for st in self.shipping_types:
|
||
if not isinstance(st, dict):
|
||
continue
|
||
st_name = st.get("name")
|
||
if not isinstance(st_name, str):
|
||
continue
|
||
if self._normalize_shipping_type_name_key(st_name) == key:
|
||
return st
|
||
return None
|
||
|
||
def _normalize_requested_shipping_type_names(self, raw: Any) -> List[str]:
|
||
"""Имена типов из ответа LLM: только известные из конфига, порядок сохраняется, без дублей."""
|
||
if raw is None:
|
||
return []
|
||
items: List[Any]
|
||
if isinstance(raw, str):
|
||
items = [raw]
|
||
elif isinstance(raw, list):
|
||
items = raw
|
||
else:
|
||
return []
|
||
known = {
|
||
st.get("name")
|
||
for st in self.shipping_types
|
||
if isinstance(st, dict) and isinstance(st.get("name"), str) and st.get("name")
|
||
}
|
||
known_by_key: Dict[str, str] = {}
|
||
for st_name in known:
|
||
k = self._normalize_shipping_type_name_key(st_name)
|
||
if k and k not in known_by_key:
|
||
known_by_key[k] = st_name
|
||
out: List[str] = []
|
||
seen: set[str] = set()
|
||
for x in items:
|
||
if not isinstance(x, str):
|
||
continue
|
||
n = x.strip()
|
||
if not n:
|
||
continue
|
||
resolved = n
|
||
if resolved not in known:
|
||
nk = self._normalize_shipping_type_name_key(resolved)
|
||
mapped = known_by_key.get(nk) if nk else None
|
||
if mapped:
|
||
resolved = mapped
|
||
if resolved not in known:
|
||
if n:
|
||
logger.warning("requested_shipping_type_names: неизвестное имя %r — пропуск", n)
|
||
continue
|
||
if resolved not in seen:
|
||
seen.add(resolved)
|
||
out.append(resolved)
|
||
return out
|
||
|
||
def _build_candidates_from_type_names(self, names: List[str]) -> List[Dict]:
|
||
out: List[Dict] = []
|
||
for n in names:
|
||
rec = self._shipping_type_record_by_name(n)
|
||
if not rec:
|
||
continue
|
||
cai = rec.get("criteria_ai") or rec.get("criteria") or ""
|
||
out.append(
|
||
{
|
||
"shipping_type": rec.get("name") or n,
|
||
"match_score": 1,
|
||
"criteria": str(cai),
|
||
}
|
||
)
|
||
return out
|
||
|
||
def _enrich_with_shipping_types(
|
||
self, structured_data: Dict, context_text: str = "", sources: Optional[List[Dict]] = None
|
||
) -> Dict:
|
||
if not self.shipping_types:
|
||
logger.warning("No shipping types loaded, skipping enrichment")
|
||
return structured_data
|
||
|
||
shipments = structured_data.get('shipments', [])
|
||
|
||
skip_keys_for_match = frozenset({
|
||
"shipping_type",
|
||
"shipping_type_candidates",
|
||
"requested_shipping_type_names",
|
||
"criteria",
|
||
"criteria_preview",
|
||
"documents_found",
|
||
})
|
||
|
||
for shipment in shipments:
|
||
text_parts: List[str] = []
|
||
|
||
for key, value in shipment.items():
|
||
if key in skip_keys_for_match:
|
||
continue
|
||
if isinstance(value, str) and value:
|
||
text_parts.append(value.lower())
|
||
elif isinstance(value, dict):
|
||
for v in value.values():
|
||
if isinstance(v, str):
|
||
text_parts.append(v.lower())
|
||
elif isinstance(value, list):
|
||
for item in value:
|
||
if isinstance(item, str):
|
||
text_parts.append(item.lower())
|
||
|
||
if context_text:
|
||
text_parts.append(context_text.lower())
|
||
|
||
combined_text = " ".join(text_parts)
|
||
|
||
self._correct_multimodal_sea_rail_for_shipment(shipment, combined_text)
|
||
|
||
requested_names = self._normalize_requested_shipping_type_names(
|
||
shipment.get("requested_shipping_type_names")
|
||
)
|
||
if requested_names:
|
||
candidates = self._build_candidates_from_type_names(requested_names)
|
||
if candidates:
|
||
shipment["shipping_type_candidates"] = candidates
|
||
shipment["requested_shipping_type_names"] = [
|
||
c["shipping_type"] for c in candidates
|
||
]
|
||
shipment["shipping_type"] = (
|
||
requested_names[0] if len(requested_names) == 1 else ""
|
||
)
|
||
shipment["criteria"] = (
|
||
candidates[0]["criteria"] if len(candidates) == 1 else ""
|
||
)
|
||
shipment["criteria_preview"] = ""
|
||
logger.info(
|
||
"shipping_type_candidates из LLM (requested_shipping_type_names): %s",
|
||
shipment["requested_shipping_type_names"],
|
||
)
|
||
self._reconcile_shipment_load_fields(shipment, combined_text)
|
||
continue
|
||
|
||
if self._text_requests_all_shipping_types(combined_text):
|
||
shipping_type_candidates: List[Dict] = []
|
||
for st in self.shipping_types:
|
||
if not isinstance(st, dict):
|
||
continue
|
||
name = st.get("name")
|
||
if not name:
|
||
continue
|
||
criteria_ai = st.get("criteria_ai") or st.get("criteria") or ""
|
||
shipping_type_candidates.append({
|
||
"shipping_type": name,
|
||
"match_score": 1,
|
||
"criteria": str(criteria_ai),
|
||
})
|
||
shipment["shipping_type_candidates"] = shipping_type_candidates
|
||
shipment["shipping_type"] = ""
|
||
shipment["criteria"] = ""
|
||
shipment["criteria_preview"] = ""
|
||
logger.info(
|
||
"Shipping types: letter requests all delivery modes → %d candidates (all configured types)",
|
||
len(shipping_type_candidates),
|
||
)
|
||
continue
|
||
|
||
# Тип перевозки задаёт LLM; здесь только подстановка criteria и согласование FCL/LCL.
|
||
llm_name = (shipment.get("shipping_type") or "").strip()
|
||
record = self._shipping_type_record_by_name(llm_name)
|
||
|
||
if record:
|
||
st_name = record.get("name") or ""
|
||
criteria_ai = record.get("criteria_ai") or record.get("criteria") or ""
|
||
cstr = str(criteria_ai)
|
||
shipment["shipping_type"] = st_name
|
||
shipment["criteria"] = cstr
|
||
shipment["criteria_preview"] = ""
|
||
shipment["shipping_type_candidates"] = [
|
||
{"shipping_type": st_name, "match_score": 0, "criteria": cstr}
|
||
]
|
||
self._reconcile_shipment_load_fields(shipment, combined_text)
|
||
continue
|
||
|
||
if llm_name:
|
||
shipment["criteria"] = ""
|
||
shipment["criteria_preview"] = ""
|
||
shipment["shipping_type_candidates"] = [
|
||
{"shipping_type": llm_name, "match_score": 0, "criteria": ""}
|
||
]
|
||
self._reconcile_shipment_load_fields(shipment, combined_text)
|
||
logger.warning(
|
||
"shipping_type %r не найден в конфигурации типов; criteria оставлено пустым",
|
||
llm_name,
|
||
)
|
||
continue
|
||
|
||
recipient_emails = self._recipients_for_shipment(shipment, sources or [])
|
||
recipient_st = self._pick_recipient_shipping_type(
|
||
recipient_emails, combined_text
|
||
)
|
||
if recipient_st:
|
||
criteria_ai = recipient_st.get("criteria_ai") or recipient_st.get("criteria") or ""
|
||
cstr = str(criteria_ai)
|
||
rname = recipient_st.get("name") or ""
|
||
shipment["shipping_type_candidates"] = [
|
||
{"shipping_type": rname, "match_score": 0, "criteria": cstr}
|
||
]
|
||
shipment["shipping_type"] = rname
|
||
shipment["criteria"] = cstr
|
||
shipment["criteria_preview"] = ""
|
||
logger.info(
|
||
"Shipping type по ящику получателя (LLM не указал тип): name=%s recipients=%s",
|
||
rname,
|
||
sorted(recipient_emails),
|
||
)
|
||
else:
|
||
shipment["shipping_type_candidates"] = []
|
||
shipment["shipping_type"] = ""
|
||
shipment["criteria"] = ""
|
||
shipment["criteria_preview"] = ""
|
||
logger.info("Shipping type: LLM пусто и нет сопоставления по получателям писем")
|
||
|
||
self._reconcile_shipment_load_fields(shipment, combined_text)
|
||
|
||
return structured_data
|
||
|
||
def _reconcile_shipment_load_fields(self, shipment: Dict, combined_text: str) -> None:
|
||
"""Согласует shipment_type (FCL/LCL) с выбранным shipping_type и текстом."""
|
||
if not isinstance(shipment, dict):
|
||
return
|
||
name = shipment.get("shipping_type") or ""
|
||
implied = _shipping_type_name_implies_load_mode(name)
|
||
if not implied:
|
||
return
|
||
t_lm = infer_container_load_mode_from_text((combined_text or "").lower())
|
||
cur_raw = shipment.get("shipment_type")
|
||
cur = (cur_raw or "").strip().upper() if isinstance(cur_raw, str) else ""
|
||
if cur not in ("LCL", "FCL"):
|
||
shipment["shipment_type"] = implied
|
||
return
|
||
if cur != implied:
|
||
if t_lm == cur:
|
||
return
|
||
if t_lm == implied:
|
||
shipment["shipment_type"] = implied
|
||
elif t_lm is None:
|
||
shipment["shipment_type"] = implied
|
||
|
||
def _save_report_to_file(self, session_id: str, data: Dict):
|
||
try:
|
||
reports_dir = "/opt/nek/app/reports"
|
||
os.makedirs(reports_dir, exist_ok=True)
|
||
|
||
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
|
||
filename = os.path.join(reports_dir, f"{session_id}_{timestamp}.json")
|
||
|
||
with open(filename, 'w', encoding='utf-8') as f:
|
||
json.dump(data, f, ensure_ascii=False, indent=2, default=str)
|
||
|
||
logger.info(f"Report saved to {filename}")
|
||
|
||
except Exception as e:
|
||
logger.error(f"Failed to save report: {e}", exc_info=True)
|
||
|
||
def _prepare_template_data(self, shipment: Dict, missing_fields: List[str]) -> Dict:
|
||
def _first_non_empty_str(*candidates: Any, default: str = "не указан") -> str:
|
||
for c in candidates:
|
||
if c is None:
|
||
continue
|
||
s = str(c).strip()
|
||
if s:
|
||
return s
|
||
return default
|
||
|
||
_ct_src = shipment.get("container_type")
|
||
_ct_raw = str(_ct_src).strip() if isinstance(_ct_src, str) and _ct_src.strip() else ""
|
||
|
||
_crd = shipment.get("cargo_ready_date")
|
||
if isinstance(_crd, list):
|
||
_crd_str = ", ".join(str(x).strip() for x in _crd if x is not None and str(x).strip())
|
||
else:
|
||
_crd_str = str(_crd).strip() if _crd is not None else ""
|
||
|
||
data = {
|
||
"client_name": str(shipment.get("client_name") or "Клиент").strip() or "Клиент",
|
||
"cargo_ready_date": _crd_str or "не указана",
|
||
"pickup_address": str(shipment.get("pickup_address") or "не указан").strip() or "не указан",
|
||
"delivery_address": str(shipment.get("delivery_address") or "не указан").strip() or "не указан",
|
||
"total_weight_kg": shipment.get("total_weight_kg") if shipment.get("total_weight_kg") is not None else "не указан",
|
||
"package_count": shipment.get("package_count") if shipment.get("package_count") is not None else "не указан",
|
||
"total_volume_cbm": shipment.get("total_volume_cbm") if shipment.get("total_volume_cbm") is not None else "не указан",
|
||
"cargo_description": str(shipment.get("cargo_description") or "не указано").strip() or "не указано",
|
||
"hs_code": str(shipment.get("hs_code") or "не указан").strip() or "не указан",
|
||
# Три-стейт для строгой логики: неизвестно -> "Информация отсутствует"
|
||
"msds_required": (
|
||
"✅ Да"
|
||
if shipment.get("msds_required") is True
|
||
else ("❌ Нет" if shipment.get("msds_required") is False else "Информация отсутствует")
|
||
),
|
||
"missing_fields": "",
|
||
"estimated_cost": "по запросу",
|
||
"estimated_transit_time": "по запросу",
|
||
"dimensions_str": "не указаны",
|
||
"dangerous_goods_str": "Информация отсутствует",
|
||
"brand_authorization_info": "Информация отсутствует",
|
||
"loading_port": _first_non_empty_str(
|
||
shipment.get("loading_port"), shipment.get("pickup_address")
|
||
),
|
||
"discharge_port": _first_non_empty_str(
|
||
shipment.get("discharge_port"), shipment.get("delivery_address")
|
||
),
|
||
"shipment_type": _first_non_empty_str(
|
||
shipment.get("shipment_type"),
|
||
default="не указан",
|
||
),
|
||
"container_type": normalize_container_type_display(
|
||
_ct_raw or None,
|
||
empty="не указан",
|
||
),
|
||
"vehicle_type": _first_non_empty_str(shipment.get("vehicle_type"), default="не указан"),
|
||
"temperature_range": _first_non_empty_str(
|
||
shipment.get("temperature_range"), default="не указан"
|
||
),
|
||
"vehicle_dimensions_str": "не указаны",
|
||
}
|
||
|
||
# Габариты
|
||
dims = shipment.get("dimensions", [])
|
||
if isinstance(dims, list) and dims:
|
||
dim_strings = []
|
||
for d in dims:
|
||
if isinstance(d, dict):
|
||
l = d.get("length_cm")
|
||
w = d.get("width_cm")
|
||
h = d.get("height_cm")
|
||
if l and w and h:
|
||
dim_strings.append(f"{l}×{w}×{h} см")
|
||
if dim_strings:
|
||
data["dimensions_str"] = "; ".join(dim_strings)
|
||
|
||
vdims = shipment.get("vehicle_dimensions", [])
|
||
if isinstance(vdims, list) and vdims:
|
||
vdim_strings = []
|
||
for d in vdims:
|
||
if isinstance(d, dict):
|
||
l = d.get("length_cm")
|
||
w = d.get("width_cm")
|
||
h = d.get("height_cm")
|
||
if l and w and h:
|
||
vdim_strings.append(f"{l}×{w}×{h} см")
|
||
if vdim_strings:
|
||
data["vehicle_dimensions_str"] = "; ".join(vdim_strings)
|
||
|
||
# Опасные грузы
|
||
dg_labels = {
|
||
"batteries": "батарейки",
|
||
"gases": "газы",
|
||
"liquids": "жидкости",
|
||
"dry_ice": "сухой лёд",
|
||
}
|
||
dg_keys = ("batteries", "gases", "liquids", "dry_ice")
|
||
dg = shipment.get("dangerous_goods", {})
|
||
if isinstance(dg, dict):
|
||
yes = [dg_labels[k] for k in dg_keys if dg.get(k) is True]
|
||
explicit_no = [dg_labels[k] for k in dg_keys if dg.get(k) is False]
|
||
if yes:
|
||
data["dangerous_goods_str"] = "Да: " + ", ".join(yes)
|
||
elif len(explicit_no) == len(dg_keys):
|
||
data["dangerous_goods_str"] = "Нет"
|
||
elif explicit_no:
|
||
data["dangerous_goods_str"] = (
|
||
"Нет: " + ", ".join(explicit_no) + " — по остальным категориям информации нет"
|
||
)
|
||
else:
|
||
note = shipment.get("dangerous_goods_note")
|
||
if isinstance(note, str) and note.strip():
|
||
data["dangerous_goods_str"] = note.strip()
|
||
else:
|
||
data["dangerous_goods_str"] = "Информация отсутствует"
|
||
|
||
# Авторизационное письмо бренда — выводим максимально полный контекст.
|
||
brand_info = shipment.get("brand_authorization_info")
|
||
if isinstance(brand_info, str) and brand_info.strip():
|
||
data["brand_authorization_info"] = brand_info.strip()
|
||
data["brand_authorization_letter"] = brand_info.strip()
|
||
else:
|
||
brand_flag = shipment.get("brand_authorization_letter")
|
||
if brand_flag is True:
|
||
fallback = "Требуется авторизационное письмо бренда, подробности в письме не найдены."
|
||
elif brand_flag is False:
|
||
fallback = "Авторизационное письмо бренда не требуется (по переписке)."
|
||
else:
|
||
fallback = "Информация отсутствует"
|
||
data["brand_authorization_info"] = fallback
|
||
data["brand_authorization_letter"] = fallback
|
||
|
||
# Стоимость и срок
|
||
shipping_options = shipment.get("shipping_options", [])
|
||
if isinstance(shipping_options, list) and shipping_options:
|
||
first_opt = shipping_options[0]
|
||
if isinstance(first_opt, dict):
|
||
cost = first_opt.get("cost")
|
||
transit = first_opt.get("transit_time")
|
||
if cost:
|
||
data["estimated_cost"] = str(cost).strip() or "по запросу"
|
||
if transit:
|
||
data["estimated_transit_time"] = str(transit).strip() or "по запросу"
|
||
|
||
# Формирование списка недостающих полей
|
||
if missing_fields:
|
||
missing_labels = [FIELD_LABELS.get(f, f) for f in missing_fields]
|
||
if len(missing_labels) > 1:
|
||
data["missing_fields"] = "\n".join(f"- {item}" for item in missing_labels)
|
||
else:
|
||
data["missing_fields"] = missing_labels[0] if missing_labels else ""
|
||
|
||
return data
|
||
|
||
def detect_documents(self, sources):
    """Find known document types among the e-mail attachments.

    Each attachment is assigned to at most one of ``msds``, ``dgm`` or
    ``brand_authorization`` by keyword search: a hit in the file name
    scores 3, a hit in the attachment text scores 1, and the best score
    wins. A single file is never reported as both MSDS and DGM (in
    practice these are almost always separate files).

    Args:
        sources: Iterable of e-mail dicts with ``id``, ``subject`` and
            ``attachments`` (dicts with ``filename`` and ``text``); may be
            ``None``.

    Returns:
        Dict mapping each document type to a list of
        ``{"filename", "email_subject", "email_id"}`` entries.
    """
    keywords = {
        "msds": [
            "msds", "material safety data sheet", "sds", "safety data sheet"
        ],
        "dgm": [
            "dangerous goods declaration", "dgm", "dgd", "shipper's declaration",
            "imo declaration", "imdg declaration"
        ],
        "brand_authorization": ["authorization letter", "authorisation letter", "brand authorization"]
    }
    result = {doc_type: [] for doc_type in keywords}

    def _mentions(doc_type: str, haystack: str) -> bool:
        """Tell whether any keyword of *doc_type* occurs in *haystack*."""
        return any(word in haystack for word in keywords.get(doc_type, []))

    def classify_attachment(filename_l: str, text_l: str) -> Optional[str]:
        """Pick the single best document type for one attachment, if any."""
        scores = {
            doc_type: 3 * _mentions(doc_type, filename_l) + _mentions(doc_type, text_l)
            for doc_type in keywords
        }
        if not any(scores.values()):
            return None

        # MSDS vs DGM: one attachment must not count as both.
        if scores["msds"] > 0 and scores["dgm"] > 0:
            msds_in_name = _mentions("msds", filename_l)
            dgm_in_name = _mentions("dgm", filename_l)
            if msds_in_name and not dgm_in_name:
                # The file name decides.
                scores["dgm"] = 0
            elif dgm_in_name and not msds_in_name:
                scores["msds"] = 0
            elif scores["dgm"] >= scores["msds"]:
                # Otherwise the stronger score wins; on a tie DGM is
                # considered the more specific type.
                scores["msds"] = 0
            else:
                scores["dgm"] = 0

        best_doc = max(scores, key=scores.get)
        return best_doc if scores[best_doc] > 0 else None

    seen: set = set()  # (email_id, filename, doc_type)

    for src in sources or []:
        email_id = src.get("id")
        for att in src.get("attachments", []) or []:
            filename = att.get("filename") or ""
            doc_type = classify_attachment(
                filename.lower(), (att.get("text") or "").lower()
            )
            if not doc_type:
                continue

            key = (email_id, filename, doc_type)
            if key in seen:
                continue
            seen.add(key)

            result[doc_type].append({
                "filename": filename,
                "email_subject": src.get("subject"),
                "email_id": src.get("id")
            })

    return result
|
||
|
||
def _generate_response_letter(self, structured_data: Any) -> str:
|
||
"""
|
||
Генерирует письмо для клиента (для отображения в отчете).
|
||
В отчете всегда формируется client letter по info_request_template.
|
||
"""
|
||
def _clean_text(text: str) -> str:
|
||
if not text:
|
||
return ""
|
||
lines = text.splitlines()
|
||
result = []
|
||
prev_empty = False
|
||
for line in lines:
|
||
stripped = line.strip()
|
||
if stripped == "":
|
||
if not prev_empty:
|
||
result.append("")
|
||
prev_empty = True
|
||
else:
|
||
result.append(line.rstrip())
|
||
prev_empty = False
|
||
return "\n".join(result).strip()
|
||
|
||
def _extract_greeting_and_signature(letter_text: str) -> tuple[str, str, str]:
|
||
text = _clean_text(letter_text or "")
|
||
if not text:
|
||
return "", "", ""
|
||
|
||
lines = text.splitlines()
|
||
|
||
greeting_lines: List[str] = []
|
||
i = 0
|
||
while i < len(lines) and lines[i].strip() != "":
|
||
greeting_lines.append(lines[i].rstrip())
|
||
i += 1
|
||
greeting = "\n".join(greeting_lines).strip()
|
||
|
||
sig_idx = None
|
||
for idx in range(len(lines) - 1, -1, -1):
|
||
low = lines[idx].strip().lower()
|
||
if low.startswith("с уважением") or low.startswith("best regards"):
|
||
sig_idx = idx
|
||
break
|
||
if sig_idx is not None:
|
||
signature = "\n".join(lines[sig_idx:]).strip()
|
||
body_lines = lines[i:sig_idx]
|
||
else:
|
||
signature = ""
|
||
body_lines = lines[i:]
|
||
|
||
body = _clean_text("\n".join(body_lines))
|
||
return greeting, body, signature
|
||
|
||
if not isinstance(structured_data, dict):
|
||
logger.error(f"Expected dict for structured_data, got {type(structured_data)}")
|
||
return "Ошибка: не удалось получить данные для формирования письма."
|
||
|
||
shipments = structured_data.get('shipments', [])
|
||
|
||
if not isinstance(shipments, list) or not shipments:
|
||
return "Не удалось извлечь данные о грузоперевозках из писем."
|
||
|
||
shipping_types = load_shipping_types()
|
||
if not shipping_types:
|
||
return "Ошибка: шаблоны писем не найдены в конфигурации."
|
||
|
||
sections: List[str] = []
|
||
greeting = ""
|
||
signature = ""
|
||
|
||
valid_shipments = [s for s in shipments if isinstance(s, dict)]
|
||
if not valid_shipments:
|
||
return "Не удалось извлечь данные о грузоперевозках из писем."
|
||
|
||
def _candidates_for_letter(sh: Dict) -> List[Dict]:
|
||
cands = sh.get("shipping_type_candidates")
|
||
if isinstance(cands, list) and cands:
|
||
return [c for c in cands if isinstance(c, dict)]
|
||
st_name = sh.get("shipping_type") or ""
|
||
return [{"shipping_type": st_name, "criteria": sh.get("criteria", "")}]
|
||
|
||
default_type = shipping_types[0]
|
||
|
||
section_idx = 0
|
||
for i, shipment in enumerate(valid_shipments, start=1):
|
||
required_fields = [
|
||
"client_name", "incoterms", "pickup_address", "cargo_value",
|
||
"package_count", "total_weight_kg", "dimensions", "total_volume_cbm",
|
||
"cargo_description", "delivery_address"
|
||
]
|
||
|
||
missing: List[str] = []
|
||
for field in required_fields:
|
||
val = shipment.get(field)
|
||
if field == "dimensions":
|
||
if not val or not isinstance(val, list) or not val:
|
||
missing.append(field)
|
||
elif (
|
||
val is None
|
||
or (isinstance(val, (int, float)) and float(val) == 0)
|
||
or (isinstance(val, str) and val.strip() in {"", "0", "0.0", "0,0", "0,00"})
|
||
):
|
||
missing.append(field)
|
||
|
||
# Под отчетами показываем только клиентское письмо.
|
||
# Письмо контрагенту (confirmation_template) не включаем в report output.
|
||
template_key = "info_request_template"
|
||
|
||
for variant in _candidates_for_letter(shipment):
|
||
type_name = (variant.get("shipping_type") or "").strip() or "Тип не указан"
|
||
selected_type = None
|
||
if type_name and type_name != "Тип не указан":
|
||
selected_type = next(
|
||
(st for st in shipping_types if st.get("name") == type_name),
|
||
None,
|
||
)
|
||
if not selected_type:
|
||
selected_type = default_type
|
||
|
||
template = selected_type.get(template_key, "")
|
||
if not template:
|
||
continue
|
||
|
||
variant_missing = list(missing)
|
||
variant_missing.extend(
|
||
collect_extra_required_missing(shipment, selected_type)
|
||
)
|
||
variant_missing = list(dict.fromkeys(variant_missing))
|
||
|
||
template_data = self._prepare_template_data(shipment, variant_missing)
|
||
|
||
try:
|
||
letter_one = auto_fill_template(template, template_data, FIELD_LABELS)
|
||
except Exception as e:
|
||
logger.error(f"Template formatting error: {e}", exc_info=True)
|
||
letter_one = "Произошла ошибка при формировании текста письма."
|
||
|
||
g, body, sig = _extract_greeting_and_signature(letter_one)
|
||
if not greeting and g:
|
||
greeting = g
|
||
if not signature and sig:
|
||
signature = sig
|
||
|
||
section_body = body if body else _clean_text(letter_one)
|
||
section_idx += 1
|
||
sections.append(
|
||
f"Перевозка ({section_idx}) — {type_name}\n\n{section_body}".strip()
|
||
)
|
||
|
||
combined = "\n\n".join(sections).strip()
|
||
result_parts = [p for p in [greeting, combined, signature] if p]
|
||
return _clean_text("\n\n".join(result_parts)) if result_parts else "Не удалось сформировать письмо по перевозкам."
|
||
|
||
async def generate_cargo_report(self, session_id: str) -> Dict:
    """Build the full cargo report for a session.

    Collects the session sources, runs the fixed report query through
    ``query_cargo_info`` (``top_k=50``), attaches the generated client
    letter, normalizes the payload with ``_normalize_api_report_payload``
    and adds the context text and source e-mails. The report is also
    saved to disk when *session_id* is non-empty.

    Args:
        session_id: Identifier of the session whose e-mails are analysed.

    Returns:
        The report dict with ``generated_letter``, ``context_text`` and
        ``emails`` added to the query result.
    """
    query = """
Составь полный структурированный отчёт о всех грузоперевозках из предоставленных писем.
Для каждой перевозки укажи все доступные детали: маршрут, характеристики груза,
особые условия, требуемые документы и предложи варианты доставки.
"""
    context_text, sources = self._collect_session_sources(session_id)
    result = await self.query_cargo_info(query, session_id, top_k=50)

    # The letter is built from the raw structured data, before normalization.
    result['generated_letter'] = self._generate_response_letter(
        result.get('structured_data', {})
    )
    result = self._normalize_api_report_payload(result)
    result['context_text'] = context_text
    result['emails'] = sources

    if session_id:
        self._save_report_to_file(session_id, result)
    return result
|