
    hOi}+                        d Z ddlZddlZddlZddlmZmZmZmZm	Z	 ddl
mZ  ej                  e      Zej                  j!                  dd      j#                         dk(  Zej                  j!                  dd      Z eej                  j!                  d	d
            Z eej                  j!                  dd            Z eej                  j!                  dd            Z eej                  j!                  dd            Zdadad Zeeefdedeeeef      dedededeeeef      fdZeddfdedeeeef      dedededeeeef      fdZdeeef   fdZ 	 d(dedeeeef      dedeeeef      fd Z!	 	 	 d)deeeef      d!ed"ededeeeef      f
d#Z"ed$d$dfdedeeeef      ded%e#d&e#dedeeeef      fd'Z$y)*a  
Reranker Module for RAG System
================================
Implements cross-encoder based reranking to improve retrieval precision.

Based on research from "When Retrieval Succeeds and Fails: Rethinking RAG for LLMs"
which recommends reranking as a key step after initial retrieval to filter noise
and improve relevance ordering.

The cross-encoder scores (query, document) pairs jointly, providing more accurate
relevance scores than bi-encoder (embedding) similarity alone.
    N)ListDictAnyOptionalTuple)	lru_cacheRERANKER_ENABLEDtrueRERANKER_MODELz$cross-encoder/ms-marco-MiniLM-L-6-v2RERANKER_TOP_K5RERANKER_CANDIDATES20RERANKER_MIN_SCOREz-2.0RERANKER_WEIGHTz0.4Fc                     t         t         S t        ryda	 ddlm}  t        j                  dt                t        j                         } | t        d      a t        j                  dt        j                         |z
  d	d
       t         S # t        $ r t        j                  d       Y yt        $ r"}t        j                  d|        Y d}~yd}~ww xY w)z"Lazy load the cross-encoder model.NTr   )CrossEncoderzLoading reranker model: i   )
max_lengthzReranker model loaded in z.2fsz8sentence_transformers not installed. Reranking disabled.zFailed to load reranker model: )_reranker_model_model_load_attemptedsentence_transformersr   loggerinfor   timeImportErrorwarning	Exceptionerror)r   startes      ,/var/www/html/leadgen/airagagent/reranker.pyget_reranker_modelr#   #   s     " 6.~.>?@		&~#F/		e0CC/HJK QR 6qc:;s   A8B C0C8CCquery
candidatestop_k	min_scorereranker_weightreturnc           
         |sg S t         st        j                  d       |d| S t               }|t        j                  d       |d| S 	 t	        j                         }g }|D ]O  }|j                  d      xs |j                  dd      }	t        |	      dkD  r|	dd n|	}	|j                  | |	g       Q |j                  |d	      }
g }t        |      D ]  \  }}t        |
|         }||k  rd
d
t        d|       z   z  }|j                  dd      }d
|z
  |z  ||z  z   }|j                         }||d<   ||d<   ||d<   ||d<   |j                  |        |j                  d d       t	        j                         |z
  }t        j                  dt        |       dt        t        |      |       d|dd       |d| S # t         $ r'}t        j#                  d|        |d| cY d}~S d}~ww xY w)a  
    Rerank retrieval candidates using a cross-encoder model.
    
    Args:
        query: The user's search query
        candidates: List of candidate documents from initial retrieval
        top_k: Number of top results to return after reranking
        min_score: Minimum reranker score to include a result
        reranker_weight: Weight of reranker score in final combined score
    
    Returns:
        Reranked list of candidates with updated scores
    z0Reranker disabled, returning original candidatesNz;Reranker model not available, returning original candidatescontenttext i  F)show_progress_bar   g	@score      ?reranker_scorereranker_normalizedoriginal_scorec                     | d   S Nr0    xs    r"   <lambda>z#rerank_candidates.<locals>.<lambda>   s
    AgJ     Tkeyreversez	Reranked z candidates to z in z.3fr   zReranking failed: )r	   r   debugr#   r   getlenappendpredict	enumeratefloatpowcopysortr   minr   r   )r$   r%   r&   r'   r(   model
start_timepairs	candidatedoc_textreranker_scoresrerankedir2   normalized_rerankerr4   combined_scorereranked_candidateelapsedr!   s                       r"   rerank_candidatesrV   >   s?   ( 	GH&5!! E}RS&5!!5"YY[
 # 	,I }}Y/L9==3LH*-h-$*>xHHLL%*+	,  ---G %j1 	0LAy"?1#56N 	) #$q3w+H'H"I ']]7C8N_$6"556 
 "+!13A/08K453A/0*8w'OO.//	04 	.=))+
*iJ0CMSX@Y?ZZ^_fgj^kklmn ")!-.&5!!"s   E9G 	G6G1+G61G6   皙?max_per_sourcediversity_weightc                    |sg S t        | |t        |            }|s|d| S g }i }|D ]  }|j                  di       j                  dd      }	|j                  |	d      }
|
|k\  r|dxx   d|z
  z  cc<   |
dz   ||	<   |j                  |       t        |D cg c]:  }|j                  |j                  di       j                  dd      d      |k  s9|< c}      |k\  s n |j	                  d	 d
       |d| S c c}w )z
    Rerank candidates with diversity constraints to avoid over-representation
    from single sources.
    
    This implements the MMR (Maximal Marginal Relevance) concept to balance
    relevance with diversity.
    r&   Nmetadatasourceunknownr   r0   r/   c                     | d   S r6   r7   r8   s    r"   r:   z'rerank_with_diversity.<locals>.<lambda>   
    '
 r;   Tr<   )rV   rA   r@   rB   rH   )r$   r%   r&   rY   rZ   rP   selectedsource_countsrM   r^   current_countcs               r"   rerank_with_diversityrf      s,    	 !
#j/JH&5!! HM 	z2.228YG%))&!4N*g1'7#78 - 1f	"8~a}'8'8z29N9R9RS[]f9gij'ko}'}~  DI  I MM*DM9FU s   :C=
C=
c                  v    t         r
t               nd} t         | rt        nd| dut        t        t
        t        dS )z*Get status information about the reranker.N)enabledrJ   model_loadedr&   r%   r'   weight)r	   r#   r   r   r   r   r   )rJ   s    r"   get_reranker_statusrk      s9    $4 $E $#(dT))'! r;   c                     ddl }| j                         }t        |j                  d|            }g }|D ]  }|j	                  d      xs |j	                  dd      j                         t        fd|D              }|r|t        |      z  nd}	|j	                  dd	      }
|
d
|	z  z   }|j                         }|	|d<   |
|d<   t        |d      |d<   |j                  |        |j                  d d       |d| S )zm
    Simple keyword-based reranking boost.
    Useful as a fallback when cross-encoder is not available.
    r   Nz
\b\w{4,}\br+   r,   r-   c              3   ,   K   | ]  }|v sd   yw)r/   Nr7   ).0wordr+   s     r"   	<genexpr>z'keyword_boost_rerank.<locals>.<genexpr>   s     CD47?aCs   	r0   r1   g?keyword_match_ratior4         ?c                     | d   S r6   r7   r8   s    r"   r:   z&keyword_boost_rerank.<locals>.<lambda>   ra   r;   Tr<   )relowersetfindallr@   sumrA   rG   rI   rB   rH   )r$   r%   r&   rt   query_lowerquery_wordsrP   rM   matchesmatch_ratior4   boosted_scorerT   r+   s                @r"   keyword_boost_rerankr~      s    ++-Kbjj<=KH ,	==+Hy}}VR/HOOQ CKCC4?gK 00Q #w4&#*;<&^^-4?01/=+,&)-&=7#*+," MM*DM9FUr;   ideal_lengthpenalty_factorc                    g }| D ]  }|j                  d      xs |j                  dd      }t        |      }t        ||z
        |z  }|t        |d      z  }	|j                  dd      }
|
d|	z
  z  }|j	                         }||d<   |	|d	<   |
|d
<   ||d<   |j                  |        |j                  d d       |d| S )z{
    Apply length penalty to favor chunks of optimal size.
    Too short = less context, too long = diluted relevance.
    r+   r,   r-   rr   r0   r1   r/   lengthlength_penaltyr4   c                     | d   S r6   r7   r8   s    r"   r:   z'length_penalty_rerank.<locals>.<lambda>  ra   r;   Tr<   N)r@   rA   absrI   rG   rB   rH   )r%   r   r   r&   rP   rM   r+   r   length_diffpenaltyr4   penalized_scorerT   s                r"   length_penalty_rerankr      s     H ,	--	*GimmFB.GW &</0<? 3{C#88"w4(AK8&^^-'-8$/6+,/=+,&57#*+!,$ MM*DM9FUr;   Tuse_cross_encoderuse_diversityc           	          |sg S |}|rOt         rIt               }|r&t        | |t        t	        |      t
                    }nt        | |t	        |            }|rt        | |||      }|S |d| }|S )z
    Full reranking pipeline combining multiple techniques.
    
    Pipeline:
    1. Cross-encoder reranking (if available and enabled)
    2. Diversity selection (if enabled)
    3. Final top-k selection
    r\   )r&   rY   N)r	   r#   rV   rI   rA   r   r~   rf   )r$   r%   r&   r   r   rY   resultrJ   s           r"   full_rerank_pipeliner     s      	F -"$&ufCFM`<abF *%s6{KF &6)
 M Mr;   )   )i   rX   r   )%__doc__osr   loggingtypingr   r   r   r   r   	functoolsr   	getLogger__name__r   environr@   ru   r	   r   intr   r   rE   r   r   r   r   r#   strrV   rf   rk   r~   r   boolr   r7   r;   r"   <module>r      s   
   3 3 			8	$ ::>>"4f=CCEO  02XYRZZ^^$4c:;"**..)>EF 2::>>*>GH 

'8%@A  <  ),U"U"T#s(^$U" U" 	U"
 U" 
$sCx.U"v  !,,T#s(^$, , 	,
 , 
$sCx.,^T#s(^ ( !!T#s(^$! ! 
$sCx.	!L 	T#s(^$  	
 
$sCx.L  "((T#s(^$( ( 	(
 ( ( 
$sCx.(r;   