U
    qmi                    @   s  d dl T d dlm  mZ d dlmZmZmZmZ d dl	Z	d dl
Z
d dlZd dlmZmZmZ d dlZd dlZd dlZd dlZd dlmZ d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ e Zddd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJdKdLdMdNdOdPdQdRdSdTdUdVdWdXdYdZd[d\d]d^d_d`dadbdcdddedfdgdhdidjdkdldmdndodpdqdrdsdtdudvdwdxdydzd{d|d}d~dddddddddddddddddddddddddddddddgZddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd ddddddddd	d
ddddddddddddddddddgZ ddeddrd`dJd/ddRg	Z!dd d!d"d#gZ"d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3gZ#d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCgZ$e%ee  e" Z&dDdEdFdGdHdIdJgZ'eG dKdL dLZ(G dMdN dNZ)G dOdP dPe)Z*G dQdR dRe)Z+G dSdT dTZ,dS (U      )*N)ListSetDictOptional)	dataclassfieldfields)
namedtuple)BeautifulSoup   иu   вu   воu   неu   чтоu   онu   наu   яu   сu   соu   какu   аu   тоu   всеu   онаu   такu   егоu   ноu   даu   тыu   кu   уu   жеu   выu   заu   быu   поu   толькоu   ееu   мнеu   былоu   вотu   отu   меняu   ещеu   нетu   оu   изu   емуu   теперьu
   когдаu   дажеu   нуu
   вдругu   лиu   еслиu   уже   илиu   ниu   бытьu   былu   негоu   доu   васu   нибудьu
   опятьu   ужu   вамu   ведьu   тамu
   потомu   себяu   ничегоu   ейu
   можетu   ониu   тутu   гдеu   естьu   надоu   нейu   дляu   мыu   тебяu   ихu   чемu   былаu   самu   чтобu   безu
   будтоu   чегоu   разu   тожеu   себеu   подu
   будетu   жu
   тогдаu   ктоu   этотu   тогоu   потомуu
   этогоu
   какойu   совсемu   нимu
   здесьu   этомu   одинu
   почтиu   мойu   темu
   чтобыu   нееu   сейчасu   былиu   кудаu
   зачемu   всехu   никогдаu
   можноu   приu   наконецu   дваu   обu   другойu   хотьu
   послеu   надu   большеu   тотu
   черезu   этиu   насu   проu
   всегоu   нихu
   какаяu
   многоu
   развеu   триu   этуu   мояu   впрочемu   хорошоu   своюu   этойu
   передu   иногдаu
   лучшеu   чутьu   томu   нельзяu
   такойu   имu
   болееu   всегдаu   конечноu   всюu
   междуimeZmyZmyselfZweZourZoursZ	ourselvesZyouZyourZyoursZyourselfZ
yourselvesheZhimZhisZhimselfZsheZherZhersZherselfitZitsZitselfZtheyZthemZtheirZtheirsZ
themselvesZwhatwhichZwhoZwhomthisthatZtheseZthoseamisZareZwasZwerebeZbeenZbeingZhaveZhasZhadZhavingZdoZdoesZdidZdoingaZanZtheandZbutiforZbecauseasZuntilwhileZofZatZbyforwithZaboutZagainstZbetweenZintoZthroughZduringbeforeZafterZaboveZbelowtofromZupZdowninoutZonZoffZoverZunderZagainZfurtherZthenoncehereZthereZwhenwherewhyZhowallanyZbothZeachZfewZmoreZmostotherZsomeZsuchnoZnornotZonlyZownsameZsoZthanZtooZverystZcanZwillZjustZdonZshouldZnowu   почемуu   можешьdocZwwwhttphttpsmailtou
   авторu   бизнес-процессu   владелецu   датu   исполнителu   контрагентu   логическu
   отделu   постановщикu   приоритетu   проектu   процессu
   спискu   стандартнu   статусu   типZapprovbaseclosedefaultZdocumentopentasku   бизнесu   документu
   задачu   задачаu   закрытu
   обычнu   системu   созданu   черновикtextml_textnametagscommentsZaddon_fieldsZkey_phrasesc                   @   s  e Zd ZU dZeed< dZeed< dZeed< dZeed< dZ	eed< dZ
eed< dZeed< dZeed	< dZeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZ	eed< dZeed< dZeed< dZ
eed< dZeed< dZeed< dZeed< dZ eed< dZ!eed< dZ"eed< dZ#eed < dZeed< dZ$eed!< dZeed	< dZ%eed"< dZ&eed#< dZ'eed$< dZ(eed%< dZ)eed&< dZ*eed'< dZ+eed(< dZ,eed)< dZ-eed*< dZ.eed+< dZ/eed,< dZ0eed-< dZ1eed.< dZ2eed/< dZ3eed0< dZ4eed1< dZ5eed2< dS )3DriverSearchObjectN	obj_modelobj_idobj_codeobj_nameobj_modified_atobj_project_idobj_related_person_loginsobj_ml_textobj_textobj_tagsobj_logic_type_codeobj_activity_codeobj_status_typeobj_result_textobj_commentsobj_owner_nameobj_responsible_namesobj_hrefobj_parent_idobj_tree_parent_idobj_root_parent_idobj_created_atobj_deletedobj_archivedobj_author_nameobj_modified_by_nameobj_addon_fieldsobj_user_ratingobj_key_phrasesobj_company_idobj_breadcrumbs"obj_related_person_logins_tsvectorname_tsvectortext_tsvectortags_tsvectorresult_text_tsvectorcomments_tsvectoraddon_fields_tsvectorkey_phrases_tsvectorml_text_tsvectorheadlineheadline_rawtitlebreadcrumbslabelrankage_daysdebug_labelmark)6__name__
__module____qualname__r@   str__annotations__rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp    rv   rv   !./cmf/models/cmf_search_engine.pyr?   U   sj   
r?   c                   @   s4   e Zd Zdd Zdd Zdd Zdd Zd	d
 ZdS )SearchDriverBasec                 C   s   t d S NNotImplementedselfsearchrv   rv   rw   select_empty_query_top   s    z'SearchDriverBase.select_empty_query_topc                 C   s   t d S ry   rz   r|   rv   rv   rw   select_empty_query   s    z#SearchDriverBase.select_empty_queryc                 C   s   t d S ry   rz   r|   rv   rv   rw   select_bm25   s    zSearchDriverBase.select_bm25c                 C   s   t d S ry   rz   r|   rv   rv   rw   select_default   s    zSearchDriverBase.select_defaultc                 C   s   t d S ry   rz   r|   rv   rv   rw   select_attachment   s    z"SearchDriverBase.select_attachmentN)rq   rr   rs   r   r   r   r   r   rv   rv   rv   rw   rx      s
   rx   c                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
SearchDriverEvaFullSearchc                 C   s.   t j|}g }|D ]}|tf | q|S ry   )modelsCmfFullSearchZfilter_empty_topappendr?   )r}   r~   Z
found_objsres	found_objrv   rv   rw   r      s    z0SearchDriverEvaFullSearch.select_empty_query_topc           	      C   s  g }g }g }g }g }|j dks(|j dkr^tjj|jttjjdd|d}tjj|jdd|d}|j dksr|j dkrtjj|j	ttjjdd|d}tjj|j	dd|d}|j dks|j dkrtjj|j
d	dd|d
}g }|| | | | D ]}|tf | q|S )NANYCmfTaskZEMPTY   )Zforce_related_person_loginrl   rn   r~   )rl   rn   r~   CmfDocumentr   r   r:   )Zforce_field_namerl   rn   r~   )
model_namer   r   Zfilter_onceAKA_TASK_MODELSrt   gcurrent_userloginAKA_DOC_MODELSAKA_DICT_MODELSr   r?   )	r}   r~   task_empty_query_related_usertask_empty_querydoc_empty_query_related_userdoc_empty_queryZother_empty_queryr   r   rv   rv   rw   r      sZ    

z,SearchDriverEvaFullSearch.select_empty_queryc                 C   s  g }g }g }|j dks |j dkr@tjj|j|jddgdd|d}|j dksT|j dkrttjj|j|jddgdd|d}d	}|j dkrd
}|j dkrtjj|j|jd|gd|d}n*|j dkrtjj|j g|jd|gd|d}g }|| | D ]}|t	f | q|S )Nr   r   r   d   ZB25SF)force_slicerl   Zinclude_attachmentr~   r         ZTOP)r   rl   r~   r   )
r   r   r   Zsearch_once_top_bm25r   tsquery_without_synr   r   r   r?   )r}   r~   result_tasksresult_docsresult_dictZother_slicer   r   rv   rv   rw   r     sH        

  
  z%SearchDriverEvaFullSearch.select_bm25c           
      C   sT  g }g }g }g }g }g }|j dkr<tjj|j|jd|d}n$|j dkr`tjj|j g|jd|d}|j dkst|j dkrtjj|j|jdd|d}tjj|j|jd|d}|j dks|j d	krtjj|j|jdd|d}tjj|j|jd|d}|j dks|j d
krtjj|j	|jd|d}g }|| | | | | D ]}	|
tf |	 q8|S )Nr   ZMAIN)rl   r~   r   r   ZSYNsyn)rl   rp   r~   r   CmfAttachment)r   r   r   Zsearch_oncer   r   r   tsquery_with_synr   AKA_ATTACHMENT_MODELSr   r?   )
r}   r~   r   result_tasks_synr   result_docs_synr   result_attachsr   r   rv   rv   rw   r   <  sf    

      z(SearchDriverEvaFullSearch.select_defaultN)rq   rr   rs   r   r   r   r   rv   rv   rv   rw   r      s   
2r   c                   @   s   e Zd ZdS )SearchDriverElasticSearchN)rq   rr   rs   rv   rv   rv   rw   r     s   r   c                   @   sf  e Zd ZdZdgZdgZdgZdZe Z	dd Z
edFd	d
Zdd Zdd Zedd ZedddddZdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zed$d% Zed&d' Zed(d) Zed*d+ Zed,d- Zed.d/ Zd0d1 Zd2d3 Z ed4d5 Z!d6d7 Z"ee#e#e$d8d9d:Z%ed;d<d=d>Z&edGd@dAZ'edBdC Z(edHdDdEZ)dS )ICmfSearchEngineu  
    Главная задача: выполнение поиска (без учета дата-драйвера PG или Elastic).

    Алгоритм:
    - Подготовка запроса
    - Запрос делаем через дата-драйвер, получаем сырые предварительные данные (х10 объема)
    - Сортируем по классам
    - Сортируем по bm25eva
    - Проверяем права
    - Смешиваем потоки (оптимизированно с проверкой прав)
    - Формируем вывод

    Драйвер:
    - Уметь делать запросы
    - Не обязательно, но желательно: давать статистику bm25 (даже если используется Elastic,
          мы храним данные в нашей таблице все равно и можем посчитать bm25 статистику из нее)
    r   r   r   Nc                 C   s   d| _ d| _d | _d | _d| _ddg| _d | _d| _d | _d| _	d| _
d| _d| _d | _d | _i | _t | _d | _d | _d | _d | _| jjd krdd tjj D | j_ttds|   d S )	Nr   Fr   r    c                 S   s"   g | ]}|j r|jd kr|jqS ))r   r   r   )Zfull_search
class_name).0mrv   rv   rw   
<listcomp>  s     
z,CmfSearchEngine.__init__.<locals>.<listcomp>FTS_STAT_WORDS)r   orig_field_name
field_nameorig_search_queryonly_idsslicer	   no_analitycscheck_access_custom_fntoparchiveddeletedFSTorig_tag_nameorig_tree_parent_idaddon_filtersetrecent_projecttree_parent_filterquery_qstop_wordssorted_resultfinal_result	__class__r   Zcmfr   Z	CmfEntityZiter_subclasseshasattrAPPbm25_stat_words_loadr}   rv   rv   rw   __init__  s2    

zCmfSearchEngine.__init__Fc                 K   s  t |dkr|d d }|d kr$d}|r<|d dkr<ddg}|sHddg}t }||_||_||_||_||_||_||_||_	||_
|	|_|
|_||_||_||_||_tjjdd	}|d
krtd|  |  |  t||_d|jd< |jdr|jd d |_|jdkr d|_|jdkr8d|_d|_|rJt||_nt |_d|_|jdkrv|jrv|  n0|jdkr|  n|jr|  n|  |   |j!S )Ni   r         r   2   r   T)Zis_dirty   u   Идет процесс индексации, могут быть доступны не все результаты поиска. Осталось объектов: ZModelr   r;   Z
CmfCommentr>   )"lenr   r   r   r   r   r   r   r	   r   r   r   r   r   r   r   r   r   r   countZcmf_noteprepare_search_queryprepare_additional_filterlistZfullsearch_sliceendswithr   r   search_empty_top_recentsearch_empty
search_topsearch_main_prepare_final_resultr   )clsr   r   search_queryr   r   r	   r   Zcheck_accessr   r   r   tag_nametree_parent_idZrecent_projectskwargsr~   Zdirty_countrv   rv   rw   
fts_search  sf    
	




zCmfSearchEngine.fts_searchc                 C   s  t dd| j | _t dd| j| _t dd| j | _d | _d | _t	| j
 dkrt d| jsxt d| jr| j | _t d	| jr| j| _| j| jd
dd| _| | j| _| jd k	s| jd k	r| j| _t dd| j| _| j| _d | _d | _d S | | j| _t | _| j
 D ]}t|}| j| q$t| j| _| | j| _d | _| j| _| | j| _| j| _| j| _d S )Nz -[^ ]*r   z[|&()] u;   ([^A-Za-zА-Яа-я0-9](\s|$)|(\s|^)[^A-Za-zА-Яа-я0-9])r   z^[a-zA-Z0-9]+-[0-9]+$z^[0-9]+$z@^[a-zA-Z0-9]*:[0-9a-f]*-[0-9a-f]*-[0-9a-f]*-[0-9a-f]*-[0-9a-f]*$FT)synonyms
stop_wordsu   [^А-Яа-яA-Za-z0-9.-]-) resubr   lowerZsearch_query_wo_qstopr   stripZquery_obj_codeZquery_obj_idr   splitmatchupperparse_search_queryr   text_stemmingZsearch_query_ninjaZsearch_query_normalizer   r   Z
query_likequery_remove_stop_wordsZsearch_query_wo_stop_wordsr   search_query_partspeechcmfnlp
partspeechr   cmfutilZninjaquery_normalizeZsearch_query_stemm)r}   wr   rv   rv   rw   r   S  s>    
z$CmfSearchEngine.prepare_search_queryc                 C   s$   |  | j| j| _| | j| _d S ry   )	calc_tagsr   r   r   _get_all_branchesr   r   r   rv   rv   rw   r     s    z)CmfSearchEngine.prepare_additional_filterc                 C   sP   t  }|r4t|tr*|D ]}|| qn
|| | |\}}t||S ry   )r   
isinstancer   add_extract_tagsunion)r   r   r   r=   Z_tagZextracted_tagsrv   rv   rw   r     s    

zCmfSearchEngine.calc_tagsrt   ztuple[str, set[str]])r   returnc                 C   s   t  }| d}t|dkr$| |fS d} |dd D ]Z}|dkrBq4td|d}|d dkrj||d  t|dkr4|  |d  |d  } q4|  } | |fS )	zExtrats tags from the given search_query and returns its reminder and a set of extracted tags

        Args:
            search_query (str)

        Returns:
            tuple[str, list[str]]: search_query reminder and a set of extractd tags
        #r   r   Nz(\W)r   r      )r   r   r   r   r   r   )r   r=   Zsharp_splittedtokenZ
sub_tokensrv   rv   rw   r     s    

zCmfSearchEngine._extract_tagsc                 C   sx   g }g }g }g }|D ]V}|j | jkr0|| q|j | jkrH|| q|j | jkr`|| q|| q||||fS ry   )r@   r   r   r   r   )r}   search_objectsZ	aka_tasksZaka_docsZaka_dictZ
aka_attachsearch_objectrv   rv   rw   _split_search_objects_to_4_aka  s    z.CmfSearchEngine._split_search_objects_to_4_akac                 C   sr   g }g }t jr&t jjjr&t jjjdkr.g |fS tt jj}|D ]*}|jr^||jkr^|| q>|| q>||fS )Nr   )r   r   r   valuert   rF   r   )r}   r   ZrelatedZnot_related
user_loginr   rv   rv   rw   %_split_search_objects_to_related_user  s    z5CmfSearchEngine._split_search_objects_to_related_userc                 C   s:   g }g }|D ]$}|j |kr&|| q|| q||fS ry   )rp   r   )r}   r   keyrp   Znomarkr   rv   rv   rw   _split_search_objects_by_mark  s    
z-CmfSearchEngine._split_search_objects_by_markc                 C   s   g }t  }t }g }| j| }| |\}}}}dd }	ddd}
t|}|	||
| |	||
| |	||
| |	||
| |	||
| |	||
| t|}|dks||krFqqF|d d | _t | dkrtdt |   d S )	Nc                 S   sJ   |d |d krd S |sd S | d}d|_| | |d  d7  < d S )Nr   total_limitr   Z	empty_topr   popro   r   r   countersobj_listobjrv   rv   rw   append_if_exists_empty_top  s    

zKCmfSearchEngine.search_empty_top_recent.<locals>.append_if_exists_empty_topr      r   r  r   皙?z3PROF fulltext_search empty_queries_top selects got )	r   timedriverr   r  r   r   r   debug)r}   r   skip_idsprof_str   Ztop_results_tasksZtop_results_docsZtop_results_dictsZtop_results_attachr  Zcounters_empty_topres_count_beforeres_count_afterrv   rv   rw   r     s*    
z'CmfSearchEngine.search_empty_top_recentc                 C   s   g }t  }t }| j| }| |\}}}}| |\}	}
| |\}}d}t|	|
|||gr|	df|
df|df|df|df|dff}| ||| |d7 }|dkrPt	d qqPt | dkrt	dt |   || _
d S )Nr   r     W   Баг в поиске, много данных, либо не идет вычитка.r  z&PROF fulltext_search search_empty got )r   r  r  r   r  r  r*   _add_if_existsr   r  r   )r}   r   r  r  r   Zresults_tasksZresults_docsZresults_dictsresults_attachr   r   r   r   
iter_countprocessing_grouprv   rv   rw   r     s8     	
zCmfSearchEngine.search_emptyc                    s  t   }g }t }jdkr&g _d S jdkr:g _d S j}|\}}}}|d\}}	|d\}}
|d\}}|d\}}|}|}~~	|j
}	|j
}	|	j
}		|
j
}
	|j
}	|j
}t   | dkr"tdt   |   t   }t }g }i   fdd}d	d
 }ddddddd}ddd}t|dkr|d jdkr||||d  t|dkr|d jdkr||||d  t|dkr|d jdkr||||d  t|dkr|d jdkr||||d  t|	dkr6||||	d  t|
dkrR||||
d  d}d}d}d}t|dkrz|d j}t|dkr|d j}t|dkr|d j}t|dkr|d j}|}|}|}|}d}||ks||krd}tj dkr&|||| t|}|r t|dkrt|dkr|d j|d jkr||||| ||||| n||||| ||||| ||||| ||||| |||| ||||	d  ||||
d  ||||| ||||| ||||| ||||| nt|dkrpt|dkrp|d j|d jkrp||||| ||||| n||||| ||||| ||||| ||||| ||||| ||||| |||| ||||
d  ||||	d  ||||| ||||| ||||| ||||| t|}|dksH||kr&qHq&|d d _t   | dkrtdt   |   d S )Nr>   r   Zflow_idcodelike	flow_namer  z'PROF fulltext_search TOP25 selects got c                    s   t tjj}|sd S |d}|j kr<|s0d S |d}qd |j< |r\|j| dk r\d S d}|jjkrpd}|| || d krd S | 	| ||  d7  < d S )Nr   T皙?r1   r9   Z_limitr   )
rt   r   r   r   r	  rA   real_rank_bm25r@   r   r   )r   r  r  Zmax_rankr  r  Z	cur_classZexists_dictr}   rv   rw   append_if_exists  s&    


	
z4CmfSearchEngine.search_top.<locals>.append_if_existsc                 S   sJ   |d |d krd S |sd S | d}d|_| | |d  d7  < d S )Nr   r  r   r+   r   r  r
  rv   rv   rw   append_if_exists_other  s    

z:CmfSearchEngine.search_top.<locals>.append_if_exists_otherr   r     )r1   r9   cZ	doc_limitZ
task_limitZc_limitr   r    TFr   r   z2PROF fulltext_search TOP25 mixing and prepare got )r  r   r   r   r   r  r   r  r  	bm25_sortr   r   r  r   	rank_bm25r!  bm25_sort_date4r   r   )r}   r  r   r  r   r   r   result_dictsr  Zresult_docs_idcodelikeZresult_tasks_idcodelikeZresult_docs_nameZresult_tasks_nameZresult_docs_mainZresult_tasks_mainr   r#  r$  r  Zcounters_otherZmax_doc_name_rankZmax_task_name_rankZmax_doc_main_rankZmax_task_main_rankZdoc_prior  r  rv   r"  rw   r   H  s    

)  








zCmfSearchEngine.search_topc                 C   s\  g }t  }t }| j| }| |\}}}}| |d\}}	| |d\}}
t | dkrxtdt |   t }d}t|||||	|
gr |df|df|df|df|	df|df|	df|df|df|
df|df|
df|df|dfg}| 	||| |d7 }|dkrtd q q|d d	 | _
t | dkrXtd
t |   d S )Nr   r  z&PROF fulltext_search main selects got r      r   r  r  r   z1PROF fulltext_search main mixing and prepare got )r   r  r  r   r  r  r   r  r*   r  r   )r}   r   r  r  r   r   r   r+  r   r   r   r  r  rv   rv   rw   r   3  sF    

zCmfSearchEngine.search_mainc                 C   sN   |S ]:}|sq|d dkr4t|dks|d dkr4q|| qd|}|S )u   
        Подчистка оригинального квери, который ввел пользователь:
        - удаление стоп-слов
        r   r   r   r   )r   r   r   join)r   r   Zclean_search_query_listr   Zclean_search_queryrv   rv   rw   _clean_search_queryr  s     
z#CmfSearchEngine._clean_search_queryc                 C   s,   d}|  D ]}|tkrq|d| 7 }q|S )Nr   r   )r   ALL_STOP_WORDS)r   queryr   r   rv   rv   rw   r     s    z'CmfSearchEngine.query_remove_stop_wordsc                 C   s   d}|  D ]}|tkrqd|kr0|d| 7 }qt|}|dkrDq|dddd}||krjd}d}nd| }||kr|d| | 7 }qtj tdddk r|d | d | }|dd|   7 }qd|	   }t
d	d
|}|S )Nr   |r   @.i     r   u   [^ A-Za-zА-Яа-я0-9|-]r   )r   r/  r   Ztext_normalize_enrichreplacedatetimedateZtodayr-  r   r   r   )r   r0  r   r   ZlemmaZlegacy_wZlegacy_w_strrv   rv   rw   r     s,    
zCmfSearchEngine.query_normalizec                 C   s   t ||d  S )Nr   )mathlog)r   NZdfrv   rv   rw   bm25_idf  s    zCmfSearchEngine.bm25_idfc                 C   s&  i }i }t jjdd|d< t jjdd|d< t jjj d }t jjj d }|t	|d d  d }|t	|d	 d  d }t	|d
kr|d d }|d d }|t	|d d  d }|t	|d	 d  d }t	|d
kr|d d }|d d }d}	|D ]:}
i ||
j
< | |d |
j||
j
 d< ||
j
 d }q|D ]F}
|
j
|krhi ||
j
< | |d |
j||
j
 d< ||
j
 d }qNi |d< ||d d< ||d d< i |d< i |d< || d |d d< || d |d d< || d |d d< || d |d d< |t_|t_d S )Nr   )r@   r   z SELECT word, ndoc, nentry  FROM ts_stat('SELECT ml_text_tsvector from cmf_full_search where obj_model=''CmfDocument''  ') where  ndoc > 10  order by ndoc desc limit 10000 z SELECT word, ndoc, nentry  FROM ts_stat('SELECT ml_text_tsvector from cmf_full_search where obj_model=''CmfTask''  ') where  ndoc > 10  order by ndoc desc limit 10000 r   r   r   
   i  i  i  ditiFST_STAT_WORDS_DEFAULTZFST_FRQ_MIDZFST_FRQ_LOW)r   r   r   ZCmfGlobalSettingsdpdata_driverSessionexecuteZfetchallr   wordr;  Zndocr   r   
FTS_COUNTS)r   r   rE  Zcmfdoc_statZcmftask_statZdoc_mid_wordZdoc_low_wordZtask_mid_wordZtask_low_wordlastrowZlast_docZ	last_taskrv   rv   rw   r     sJ    

z$CmfSearchEngine.bm25_stat_words_loadc                 C   s   |}t |dkrd}|d j|d j dkr2d}n:|d j|d j dkrPd}n|d j|d j dkrld}|dkrt|d | dd d	||d   }|S )
Nr,  r   r   g333333?r   r   c                 S   s   | j S ry   )rn   r   rv   rv   rw   <lambda>      z1CmfSearchEngine.bm25_sort_date4.<locals>.<lambda>)r  )r   r!  sorted)r   r   newresr.   rv   rv   rw   r*    s    $zCmfSearchEngine.bm25_sort_date4c                 C   s.  g }d}t |}|D ]B}|d7 }| ||\|_|_|_| jd| 7  _|| qt|dd dd}|d d D ]4}d	|jkrv| jd
7  _d|jdd|j |_qvt|dd dd}d}|d d D ]R}|d7 }| jd| d| 7  _|j	d |j d|  |_	|j
dd|_
q|d d S )Nr   r   z - rc                 S   s   | j S ry   r)  rH  rv   rv   rw   rI    rJ  z+CmfSearchEngine.bm25_sort.<locals>.<lambda>T)r  reverser<  u   ТОП1r'  u    ТОП1.2fr   c                 S   s   | j S ry   rM  rH  rv   rv   rw   rI  $  rJ  r   z bz - z | tsq=ZDZQTNr   )r   	bm25_rankr)  r!  
rank_debugrj   r   rK  rC   rl   rh   r5  )r}   r   r0  rL  r   ZllrG  rrv   rv   rw   r(    s*    
zCmfSearchEngine.bm25_sortc           -         s   fdd}| j }i } jd k	r8 jdkr8| j}d}d}d}d}	d}
d}d}t }d} j| jkrnd}i }| D ]h}||krqz|| |
d7 }
t }d}d d t g d dt d}|d	D ]}||krq|d
 | |d dd || dD  |d7 }|tj	kr<|tj	| kr<tj	| | }ntj	d | }d}d}t
t|| dd}|||d   ||  }|}d} jdkr||krΈ jr| j kr|	d7 }	d}|d }n|d7 }|
dkr|dkr|d }|| }|d|d  d| d| d|dd|dd|dd| d |dd!| 7 }|| qt|dkr`qz|d7 }t|}| j|
d  d"kr|d# }|d$| j|
d   7 }t|dkr|d%|d&7 }||kr|||< ||d'< ||d(< qzd}d}d}| D ]X\}}d} d}!|d D ]}"d}#| D ]^\}$}%||$kr:q$|%d D ]<}&t|&|" d)krB| d7 } |#d7 }#|d* |$  q$qBq$|#|!kr|#}!q| dkrFd+|! }'|d' |' |d,< d-| d.|! d/|d, dd|d' dd0	| }|d7 }|!|kr|!}|| 7 }t|d* d1dd2dd3d}(d4| d5|( d$  j  _n|d' |d,< q j d$ j  _td6d | D }|d+|  }dd7d#d8 j    })||) }d9| d:| d; jd<d=|)d&d> jd&d?| d@ }d}*tjjjrtjjjdkr jrtjjj jkrdA}*d}+dBtkr> jtj kr>dA}+||* |+ }dC|d&dD| dE|* dF|+d&d$	| }|dkr~dG}|}, jdHkr|dH7 }dI|d&d$| }||,|fS )JNc                    s:    j dd} tdd| } | d d } td|  d S )N'"z":([0-9,]*)z":"\1", z{ z })rg   r5  r   r   orjsonloads)Ztmpr  rv   rw   tsvector2dict4  s    z0CmfSearchEngine.bm25_rank.<locals>.tsvector2dictr   r   r   r=  r>  )rP  
smart_rank
found_synsvector_positionstoken_found_idxr  near_tokensr1  r[  r\  c                 S   s   g | ]}t |qS rv   )intr   r   rv   rv   rw   r   y  s     z-CmfSearchEngine.bm25_rank.<locals>.<listcomp>,r?  g      ?g      ?r   Fr  Tr   r   z wr3  =z bm25t(bm25)=z.1f(z) idf=z fqd=z ord=z	 in_name=)ZVERBZINFNr   r   z
 SYNW_AWG=rO  rP  r]  r  r^  g?rZ  z maxNRC[z]=z smrt(bm25t)=z) rS  {}r   : c                 S   s   g | ]}|d  qS )rZ  rv   r`  rv   rv   rw   r     s     r<     z nearC=z
 sumNearC=z age=.0fz ageF=z	 sqlRank= ()g?FTS_RECENT_PROJECTSz rbm25=z wc=z rur=z rpr=gMbP?r  z code)!rg   r`   r   r@   r   r   r   extendr   r   minr   rp   rC   r   maxr   itemsabsrt   r5  rh   sumvaluesrn   rm   r   r   r   r  rF   rE   rk  )-r}   r  r0  rY  Zdoc_vector_dictZname_vector_dictrm   rQ  Z	w_doc_cntZ
w_name_cntZsynw_cntZorder_factorZfound_tokens_cntZ
done_wordsZ	model_keyZmatched_tokensZsynwZrank_setZw_cntr   r   ZidfkbZfqdZbm25Z	bm25_origZin_nameZw_avgZnear_tokens_cntZsum_near_tokens_cntZmax_of_max_near_contextZhave_near_token_cntZmax_near_contextposZtmp_max_near_contextZsynw2Ztoken2Zpos2Zmltr^  Z
age_factorZrelated_user_rankZrecent_project_rankZ	real_rankrv   rX  rw   rP  2  s    


	".
J




.
$6$&
zCmfSearchEngine.bm25_rankc                 C   s   d| kr| S |  dd S )Nz@#@#@#r   )r   )rH   rv   rv   rw   _remove_suffix_from_headline  s    z,CmfSearchEngine._remove_suffix_from_headlinec                 C   s$  | j | jd | jd  }| jri }g }|D ]}||j |jrN| |jnd}d}|jdk	rj|jd}|j}|j	r|d |j	 }|j|j
|j| d| d	| d	|jd
dtj 	t|j||j|jd||j< q,|t_td |S i }| jri }	|D ]}|	|jg |j q|	 D ]}
tt|
 jr\| jdg }dd|	|
 gdddgg}n| j}dd|	|
 g}|
dkr|dddg }|
dkr|dg }t|
j||| j| jd}|D ]8}tt|j jr|jr|||j j!< n
|||j!< qq"nt"d td g }|D ]}|#|j}|r|jrB| |jnd}d}|jdk	r`|jd}|j}|j	r||d |j	 }|j$dkr|j%sq|j&|j%j'|j%j(j)| d| d	| d	|jd
dtj 	||j|j|j!j)|j
|j*|j+|j,t|jd}n^t-.d|j
d }|j!j)||j(j)| d| d	| d	|jd
dtj 	||j|jt|jd}zl| j/st|j0ddd  n| /|st1| jD ].}|.d!d }t2||j |d}|||< q|| W n t1k
r   Y nX qtd | j3st4tj5j6| j7t8|dd" d#d$ || _9dS )%u5   
        Итоговая обработка
        r   r   r   NoneNz.6fr   z ||| r   rh  z words=)idr<   coderh   rk   rl   rm   rn   zfulltext_search ENDcmf_ver_headINZcmf_ver_curz==Trx  r   urlurl_previewurl_preview_imgr   Zfullsearch_answer)r	   filterZinclude_archivedZinclude_deleteduY   DEV: FATAL. Укажите в запросе поиска список полей fields=z"fulltext_search Start check access)rx  r<   ry  rh   rl   rm   rn   Z	attach_idZattach_nameZ
attach_urlZattach_url_previewZattach_url_preview_imgrk   z DZQTN )rx  r<   ry  rh   rl   rm   rn   rk   )ZTEXKOM_skip_failread_auditZTEXKOM_ppp_project_simplecheckr3  r   )r   obj_dict)r   ):r   r   r   r   rA   rh   rv  rm   rl   ro   rj   rB   rn   r   r   ZjsonrW  rk   fulltext_search_headlinesr  r	   
setdefaultr@   keysvarsr   Zcmf_verr   Zget_model_by_namer   r   r   rz  rx  Z	cmf_alertgetr   parentZ	parent_idr<   ry  r  r|  r}  r~  r   r   r   Z_acl_check_readZCmfPermissionErrorgetattrr   Zschedule_deferred_jobr   Z_do_calc_statisticsr   Z
dumps_dictr   )r}   r   r  Z	result_idrR  rh   Zformated_rankrl   ZobjectsZids_by_modelr   _fieldsZ_filterr  r  r   r  rj   r   attrrv   rv   rw   r     s    

$








$
$



z%CmfSearchEngine._prepare_final_result)r  r   r  c                 C   sj   |D ]`\}}t |D ]N}|s  q|d j|kr:|d q||d  ||d j |d qqd S )Nr   )rangerA   r	  r   r   )r   r  r   r  Zlstr   _rv   rv   rw   r    s    
zCmfSearchEngine._add_if_existsz	list[str])r   c                 C   s0   | sg S t jjj dd| i}dd |D S )Na  
                WITH tree_parents AS (
                    WITH RECURSIVE r AS (
                        SELECT obj_id, obj_code, obj_tree_parent_id
                        FROM cmf_full_search
                        WHERE obj_tree_parent_id = :tree_parent_id

                        UNION

                        SELECT cfs.obj_id, cfs.obj_code, cfs.obj_tree_parent_id
                        FROM cmf_full_search AS cfs
                        JOIN r ON cfs.obj_tree_parent_id = r.obj_id
                    )
                    SELECT obj_id FROM r
                    WHERE r.obj_id IN (SELECT obj_tree_parent_id FROM r)

                    UNION

                    SELECT :tree_parent_id
                )
                SELECT * FROM tree_parents;
            r   c                 S   s   g | ]}|d  qS )r   rv   )r   rR  rv   rv   rw   r     s     z5CmfSearchEngine._get_all_branches.<locals>.<listcomp>)r   r   r@  rA  rB  rC  )r   Zrecordsrv   rv   rw   r     s    z!CmfSearchEngine._get_all_branchesTc              
   C   sX  dt _|dd}tdd|}td|}d}d}d}	|D ]}
t|
dkrPq<t|
dkr|
dkrfq<|
d	kr~|	d|
 7 }	q<|
dkrq<|
d
kr|	d7 }	q<|
dkr|	d7 }	q<t|
dkrq<|
d dkrt|
dkrq<|d|
dd   7 }q<|rq<|
dd}
|d7 }|dkrd}d}|	r,|	d dkr0d}t|
dkr| j|
|d}tt	|t	t
 }t|dkr|	| d|d  7 }	n,t|dkr|	| dd| d 7 }	n |r|dkr q|dkr< qq<q<|rt|dkr|d dkr|dd  }|S |	ddddddd d }	|	rT|	d dkrT|	dd  }	|	rt|	d dkrt|	dd  }	|	r|	d dkr|	dd  }	|	r|	dd  dkr|	d d }	z.tjjj d!d"|	i}t|d d }	W n` tjjk
rF } z<tjjj  tjjj d#d"|i}t|d d }	W 5 d }~X Y nX t|d$|	 |	S )%Nr   zwww.u   [^-A-Za-zА-Яа-я0-9()|&!' ]r   z(,| |&|\||\(|\))r   r   )r   !z()&|)r   r   r1  z |)r   r   &z &r   r  r   r,  F)r1  r  r  )r   z (  | z )r<  )r  r1  z OR z or z AND  & z and z!select to_tsquery('russian', :q);q+select websearch_to_tsquery('russian', :q);z->)r   r   r5  r   r   r   r   prepare_wordr   r   r/  r-  r   r   
CmfSynonymr@  rA  rB  rC  
sqlalchemyexcProgrammingErrorrollbackprint)r   r   r   r   Z
first_wordZsearch_query_allowed_symbtokensZ
word_countZstopsr  r0   ZoperZ	sug_wordstsqueryerv   rv   rw   r     s    

("z"CmfSearchEngine.parse_search_queryc              
   C   sd  d}|  D ]*}d|kr*|d| d 7 }q|d| 7 }q| }|dd}|dd}ztjjj d	d
|i}W nN t	j
jk
r } z,tjjj  tjjj ddti}W 5 d }~X Y nX t|d d }|dddddddddddd}d}|  D ]:}d|kr2|d| 7 }n|ddt| d 7 }qd|  }|S )Nr   r1  ri  rj  r   r  r   Zhhhackskiptirez%select to_tsquery('russian', :query);r0  r  r  r   rS  r  rc  )r   r   r5  r   r  r@  rA  rB  rC  r  r  r  r  r0  r   r-  r   )r   r:   r  r   r  r  r   rv   rv   rw   r   9  s2    4
 zCmfSearchEngine.text_stemmingc                 C   s  t  jd| d7  _|d tjkr0d}tj}n
d}tj}g }t|s||}g }t	|}|rt  jd| d7  _|
| t  jd7  _d}	|D ]}
|	d	kr qHt|
d	krq|
d |d kr|
d
 |d
 krqd|
kr|
dd}
t  jd|
 d7  _|
|
 |	d
7 }	qt  jd|
 d7  _|
|
 |	d
7 }	qg }tjjj dd|i}d}	|D ]\}}|	d
kr qt|d	krqn|d |d ks|d
 |d
 krn|dd}t  jd| d7  _|
| |	d
7 }	qnt|t|B |hB }n|h}t }|D ]D}
t|
d d	 D ]*}||j t  jd|j d7  _q2q||B }t }|rtjjddt||hB gddgdgdd
gd}|D ]\}|jr|jjdd d D ]6}| dd}t  jd| d7  _|| qʐq||B t|B }t|S )Nz|w:rf  r   enruzaddNinjaRevers z, zspellError, r   r   r   r   z	addSpell z
            SELECT
                name, similarity(:word, name) as sim
            FROM cmf_synonym
            WHERE
                :word % name
            ORDER BY "sim" desc
            LIMIT 5;
             rD  zaddSpellTrgm z
normalize r<   r{  r:   Zorderno)r  r	   Zorder_byr   ra  r4  zsynAdd )r   r   stringascii_lettersr   Zdictionary_enZdictionary_ruZdictionary_checkZsuggestZninja_reversr   r   r5  r   r  r@  rA  rB  rC  r   morphparser   Znormal_formr   r:   r  r   r   )r   rD  r   langZ
dictionaryZfiltered_suggestions3ZsuggestionsZfiltered_suggestionsZnwr   r   Zfiltered_suggestions2Zsuggestions2_listZsuggr  Zall_suggestionsZnormalized_wordsZsynonym_wordsZsynonym_listZsynonymr/   rv   rv   rw   r  _  s    


 



	 
$
 zCmfSearchEngine.prepare_word)FNNFNNFFNNN)TFF)T)*rq   rr   rs   __doc__r   r   r   r   r   r  r   classmethodr   r   r   r   staticmethodr   r  r  r  r   r   r   r   r.  r   r   r;  r   r*  r(  rP  rv  r   r   r   r  r   r   r   r  rv   rv   rv   rw   r     s~   #"                 q=
%+, l?


1

3
! G
 #m
%r   )-Zcmf.includeZcmf.util.cmfnlputilr   typingr   r   r   r   r  r8  r6  Zdataclassesr   r   r	   	tracebacksys	itertoolscollectionsr
   Zenchantr  Z	pymorphy3r   r  rV  Zbs4r   ZMorphAnalyzerr  ZRUSSIAN_STOP_WORDSZENGLISH_STOP_WORDSZPROMPT_STOP_WORDS_NORMZTECH_STOP_WORDSZEVA_ARTIFACT_KEYS_STOP_WORDSZEVA_ARTIFACT_VALUES_STOP_WORDSr   r/  ZALLOWED_FIELDSr?   rx   r   r   r   rv   rv   rv   rw   <module>
   sH    3 =DD y +