
    ݾ{iLZ                         d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
 ddlmZ ddlZ	 ddlZdZ	 ddlZdZ G d d	      Zy# e$ r dZY w xY w# e$ r dZY !w xY w)
zg
Case Document Processor for AI Lawyer
Handles case-specific document upload, processing, and chunking
    N)Path)DictListOptionalTuple)datetimeTFc                   R   e Zd ZdZg dg dg dg dg dg dg dg d	g d
g dd
Zg dg dg dg dg dg dg ddZd5dededefdZ	 d5de	dedede
fdZdedefdZd ed!edeeee
   f   fd"Zd edeeee
   f   fd#Zd edeeee
   f   fd$Zded%edefd&Zdedefd'Zd%edede
fd(Zd%eded)ee
   dee
   fd*Zd%eded)ee
   dee
   fd+Zd%ed)ee
   dee
   fd,Zd%ed)ee
   dee
   fd-Z	 d6d%ed)ee
   d.ed/edee
   f
d0Zd%ed)ee
   d1edee
   fd2Zd3ed)ee
   defd4Zy)7CaseDocumentProcessorz+Process documents for a specific legal case)	complaintpetition	plaintiffzcause of action	wherefore)answer	defendantdeniesadmitszaffirmative defense)motion
memorandumzpoints and authorities
oppositionreply)interrogatoryzrequest for productionzrequest for admissionsubpoena)
depositionzq.za.witnesszsworn testimonyzdirect examination)	agreementcontractzparties agreewhereas
witnessethconsideration)orderzit is hereby orderedjudgmentdecreeruling)briefargumentzissue presentedzstatement of facts
conclusion)declarationzdeclare under penaltysworn	affidavit)exhibit
attachmentappendix)
r   r   r   	discoveryr   r   r    r$   r'   r*   )r   r   counterclaimzcross-claimr   )r   r$   r   r   reply_brief)r   request_productionrequest_admissionr   r   )r*   r'   r)   evidence)r    r!   r#   r"   )r   r   leaselicense)letteremailmemo)	pleadingsmotionsr-   r2   orders	contractscorrespondenceN	case_uuiduser_id	base_pathc                     || _         || _        |t        t              j                  dz  dz  }t        |      t        |      z  |z  | _        | j                  j                  dd       y )Ndata	case_docsT)parentsexist_ok)r=   r>   r   __file__parentstrcase_dirmkdir)selfr=   r>   r?   s       9/var/www/html/eventheodds/ai-lawyer-rag/case_processor.py__init__zCaseDocumentProcessor.__init__8   s_    "X--6DIY#g,6BD48    file_contentfilenamedocument_typereturnc                    t        j                  |      j                         }t        |      j                  j                         }| d| j                  |       }| j                  |z  }t        |d      5 }|j                  |       ddd       | j                  ||      \  }	}
|	sdd|dS |r|dk(  r| j                  ||	      }| j                  |      }| j                  |	|      }| j                  |	||
      }d|||t        |      |j!                  d	      ||t        |	      |
rt        |
      nd
|t        |      |dS # 1 sw Y   xY w)a
  
        Process an uploaded legal document

        Args:
            file_content: Raw file bytes
            filename: Original filename
            document_type: Optional document type override

        Returns:
            Dict with processing results
        _wbNFz$Could not extract text from document)successerrorrO   autoT.   )rU   rO   original_filename	file_hash	file_size	file_typerP   categorytext_length
page_countchunkschunk_countmetadata)hashlibmd5	hexdigestr   suffixlower_sanitize_filenamerH   openwrite_extract_text_classify_document_get_category_extract_legal_metadata_create_legal_chunkslenlstrip)rJ   rN   rO   rP   r[   extsafe_filenamefilepathftext	page_infor^   rc   ra   s                 rK   process_uploadz$CaseDocumentProcessor.process_uploadB   sf    KK-779	 8n##))+$+Qt'>'>x'H&IJ===0 (D! 	"QGGL!	" ,,Xs;i ?$   7 33HdCM %%m4 //mD **4	J %!)"\*C* t9,5#i.1v; 
 	
5	" 	"s   8EEc                     t         j                  j                  |      }t        j                  dd|      }t        |      dkD  r6t         j                  j                  |      \  }}|ddt        |      z
   |z   }|S )z"Sanitize filename for safe storagez
[^\w\-_\.]rS      N)ospathbasenameresubrq   splitext)rJ   rO   namers   s       rK   ri   z(CaseDocumentProcessor._sanitize_filename   sl     77##H-66-h7x=3((2ID#MSS\*S0HrM   ru   rs   c                    d}g }|dk(  r| j                  |      \  }}||fS |dv r| j                  |      \  }}||fS |dk(  r<t        |ddd      5 }|j                         }d	d	d	       d
dt	        |      dg}||fS 	 t        |ddd      5 }|j                         }d	d	d	       d
dt	        |      dg}||fS # 1 sw Y   ZxY w# 1 sw Y   )xY w#  Y ||fS xY w)z0Extract text from document with page information z.pdf)z.docxz.docz.txtrzutf-8ignore)encodingerrorsNrY   r   pagestartend)_extract_pdf_extract_docxrj   readrq   )rJ   ru   rs   rw   rx   rv   s         rK   rl   z#CaseDocumentProcessor._extract_text   s   	&="//9OD)  Y %%"00:OD) Y F]hghG  1vvx "#aD	BCI Y(C'(K $q668D$&'!CIFG	 Y   $ $Ys0   B< C C C <CCC Cc                    d}g }d}t         r	 t        j                  |      }t        t	        |            D ]K  }||   }|j                         }|}	|d|dz    d| dz  }t	        |      }|j                  |dz   |	|d       M |j                          ||fS t        r	 t        |d
      5 }t        j                  |      }t        |j                        D ]M  \  }}|j                         xs d}|}	|d|dz    d| dz  }t	        |      }|j                  |dz   |	|d       O 	 d	d	d	       ||fS ||fS # t        $ r}
t        d|
        Y d	}
~
d	}
~
ww xY w# 1 sw Y   5xY w# t        $ r}
t        d|
        Y d	}
~
||fS d	}
~
ww xY w)z(Extract text from PDF with page trackingr   r   z
[PAGE rY   z]

r   zPyMuPDF extraction failed: NrbzPyPDF2 extraction failed: )HAS_PYMUPDFfitzrj   rangerq   get_textappendclose	Exceptionprint
HAS_PYPDF2PyPDF2	PdfReader	enumeratepagesextract_text)rJ   ru   rw   rx   current_posdocpage_numr   	page_text	start_poserv   readers                rK   r   z"CaseDocumentProcessor._extract_pdf   s   	9ii) %c#h Hx=D $I +Ihx!|nC	{"EED"%d)K$$ (1!**&  		Y& 8(D) Q#--a0F*3FLL*A $$($5$5$7$=2	$/	(8a<.I;b II&)$i!(($,qL%.#.*  Y& Y/  93A37889
   821#677Y8sI   B
D8  E( ,A;E(E( 8	EEEE%!E( (	F1FFc                 
   	 ddl m}  ||      }|j                  D cg c]  }|j                   }}dj	                  |      }ddt        |      dg}||fS c c}w # t        $ r}t        d|        dg fcY d}~S d}~ww xY w)	zExtract text from DOCXr   )Document

rY   r   zDOCX extraction failed: r   N)docxr   
paragraphsrw   joinrq   r   r   )	rJ   ru   r   r   pr   rw   rx   r   s	            rK   r   z#CaseDocumentProcessor._extract_docx   s    		%8$C*-..9Q!&&9J9;;z*D"#aD	BCI?" :  	,QC01r6M	s-   A A&A A 	B&A=7B=Brw   c                 j  	 |j                         }|dd j                         	| j                  j                         D ]  \  }}|D ]  }||v s|c c S   i }| j                  j                         D ]$  \  }}t        	fd|D              }|dkD  s |||<   & |rt	        ||j
                        S y)z4Classify document type based on filename and contentNi  c              3   ,   K   | ]  }|v sd   yw)rY   N ).0r   
text_lowers     rK   	<genexpr>z;CaseDocumentProcessor._classify_document.<locals>.<genexpr>   s     ?aqJ?s   	r   keygeneral_document)rh   DOCUMENT_PATTERNSitemssummaxget)
rJ   rO   rw   filename_lowerdoc_typepatternspatternscoresscorer   s
            @rK   rm   z(CaseDocumentProcessor._classify_document   s    !)%4[&&(
 #'"8"8">">"@ 	$Hh# $n,#O$	$ "&"8"8">">"@ 	)Hh?8??Eqy#(x 	)
 v6::..!rM   c                     | j                   j                         D ]!  \  }}|v st        fd|D              s|c S  y)z$Get the category for a document typec              3   &   K   | ]  }|v  
 y wNr   )r   trP   s     rK   r   z6CaseDocumentProcessor._get_category.<locals>.<genexpr>   s     ,OAQ--?,Os   other)
CATEGORIESr   any)rJ   rP   r^   typess    `  rK   rn   z#CaseDocumentProcessor._get_category   sB    #446 	 OHe%,O,O)O	  rM   c                 \   i g ddg g d}|dd }g d}|D ]M  }t        j                  ||t         j                        }|s+|j                  d      j	                         |d<    n g d}|D ]M  }t        j                  ||t         j                        }|s+|j                  d      j	                         |d<    n g d	}	|	D ]  \  }}
t        j                  ||      }|s|
|d
   vs'|j                  d      j	                         }t        j
                  dd|      }|j                  d      }t        |      dk  s}||d
   |
<    g d}|D ]/  }t        j                  ||      }|d   j                  |dd        1 |dv rcg d}|D ]Z  }t        j                  ||t         j                        }|d   j                  |dd D cg c]  }|j	                          c}       \ g d}|D ]7  }|j                         |j                         v s$|d   j                  |       9 |S c c}w )z)Extract legal metadata from document textN)partiesdatescase_numbercourtclaims	key_termsi'  )z*Case\s*(?:No\.?|Number:?)\s*([A-Z0-9\-:]+)z>(?:Civil|Criminal)\s*(?:No\.?|Action\s*No\.?)\s*([A-Z0-9\-:]+)z,Docket\s*(?:No\.?|Number:?)\s*([A-Z0-9\-:]+)rY   r   )z?(?:IN\s+THE\s+)?(?:UNITED\s+STATES\s+)?(DISTRICT\s+COURT[^\n]+)z((?:IN\s+THE\s+)?(SUPERIOR\s+COURT[^\n]+)z'(?:IN\s+THE\s+)?(CIRCUIT\s+COURT[^\n]+)z-(?:IN\s+THE\s+)?(COURT\s+OF\s+APPEALS?[^\n]+)r   ))z#([A-Z][A-Za-z\s,\.]+),?\s*Plaintiffr   )z#([A-Z][A-Za-z\s,\.]+),?\s*Defendantr   )z$Plaintiff[:\s]+([A-Z][A-Za-z\s,\.]+)r   )z$Defendant[:\s]+([A-Z][A-Za-z\s,\.]+)r   r   z\s+ z,.d   )z\b(\d{1,2}/\d{1,2}/\d{2,4})\bz\b(\d{1,2}-\d{1,2}-\d{2,4})\bz%\b([A-Z][a-z]+\s+\d{1,2},?\s+\d{4})\br   
   )r   r   )zQ(?:FIRST|SECOND|THIRD|FOURTH|FIFTH)\s+(?:CAUSE\s+OF\s+ACTION|CLAIM)[:\s]+([^\n]+)z5COUNT\s+(?:I|II|III|IV|V|ONE|TWO|THREE)[:\s]+([^\n]+)zP(?:FOR|CLAIM\s+FOR)\s+(BREACH\s+OF\s+CONTRACT|NEGLIGENCE|FRAUD|[A-Z][A-Za-z\s]+)r   )zbreach of contract
negligencefraudmisrepresentationzbreach of fiduciary dutyzunjust enrichment
conversion
defamationzintentional inflictionzstrict liabilityzsummary judgmentzdefault judgmentzpreliminary injunctionztemporary restraining orderzclass actionzderivative actionr   )r   search
IGNORECASEgroupstripr   rstriprq   findallextendrh   r   )rJ   rw   rP   rc   
text_uppercase_patternsr   matchcourt_patternsparty_patterns
party_type
party_namedate_patternsmatchesclaim_patternsmlegal_termsterms                     rK   ro   z-CaseDocumentProcessor._extract_legal_metadata  sN    
 &5\


 % 	GIIgz2==AE*/++a.*>*>*@'		
 & 	GIIgz2==AE$)KKN$8$8$:!		
 $2 	AGZIIgz2E8I+>>"[[^113
VVFC<
'..t4
z?S(6@HY'
3	A

 % 	3Gjj*5GW$$WSb\2	3
 55N
 * M**Wj"--H"))gcrl*K1779*KLM

   	3Dzz|z//11%,,T2	3  +Ls   H)
rx   c                    g }|dv r| j                  |||      }nB|dk(  r| j                  ||      }n*|dk(  r| j                  ||      }n| j                  ||      }t	        |      D ]  \  }}||d<   ||d<   | j
                  |d<     |S )z-Create intelligent chunks for legal documents)r   r   r   r$   r   r   chunk_indexrP   r=   )_chunk_by_sections_chunk_contract_chunk_deposition_semantic_chunkr   r=   )rJ   rw   rP   rx   ra   ichunks          rK   rp   z*CaseDocumentProcessor._create_legal_chunks\  s     FF,,T=)LFj())$	:Fl*++D)<F ))$	:F "&) 	0HAu#$E- %2E/"!%E+	0
 rM   c                    g }g d}dg}|D ]j  }t        j                  ||t         j                        D ]@  }|j                  |j	                         |j                  d      j                         f       B l |j                  d        |j                  t        |      df       t        t        |      dz
        D ]  }	||	   d   }
||	dz      d   }||	   d   }||
| j                         }t        |      dk  r@| j                  |
|      }t        |      d	kD  rt| j                  |||
      }t        |      D ]R  \  }}|j                  |d
   ||dz   |j                  d|      |
|j                  dd      z   t        |d
         d       T |j                  ||||
t        |      d        t        |      dk  r| j                  ||      S |S )z/Chunk legal documents by their natural sections)zw\n\s*((?:FIRST|SECOND|THIRD|FOURTH|FIFTH|SIXTH|SEVENTH|EIGHTH|NINTH|TENTH)\s+(?:CAUSE\s+OF\s+ACTION|CLAIM|COUNT)[^\n]*)zO\n\s*(COUNT\s+(?:I|II|III|IV|V|VI|VII|VIII|IX|X|ONE|TWO|THREE|FOUR|FIVE)[^\n]*)z9\n\s*((?:I|II|III|IV|V|VI|VII|VIII|IX|X)\.\s+[A-Z][^\n]+)z!\n\s*(JURISDICTION\s+AND\s+VENUE)z\n\s*(PARTIES)z,\n\s*(FACTUAL\s+(?:ALLEGATIONS?|BACKGROUND))z,\n\s*(STATEMENT\s+OF\s+(?:FACTS|THE\s+CASE))z\n\s*(ARGUMENT|DISCUSSION)z\n\s*(CONCLUSION)z\n\s*(PRAYER\s+FOR\s+RELIEF)z\n\s*(WHEREFORE))r   	BEGINNINGrY   c                     | d   S Nr   r   xs    rK   <lambda>z:CaseDocumentProcessor._chunk_by_sections.<locals>.<lambda>  
    ad rM   r   ENDr   2   i  contentr   r   )r   section
subsectionr   
start_charcontent_lengthr   r  r   r  r  )r   finditerr   r   r   r   r   sortrq   r   _get_page_for_position_split_long_sectionr   r   r   )rJ   rw   rP   rx   ra   section_patterns
boundariesr   r   r   r   end_possection_namesection_textr   
sub_chunksj	sub_chunks                     rK   r   z(CaseDocumentProcessor._chunk_by_sectionsv  s     
 ''
' 	KGWdBMMB K!!5;;=%++a.2F2F2H"IJK	K
 	N+3t9e,- s:*+  	A"1a(I Q'*G%a=+L	'288:L< 2% 229iHH < 4'!55lIyY
$-j$9 LAyMM#,Y#7#/&'!e )fh ?&/)--2K&K*-i	.B*C#  ++$"+&),&7 5 	F v;!''i88rM   c                    g }g d}dg}|D ]o  }t        j                  ||t         j                        D ]E  }|j                  d      j	                         }|j                  |j                         |dd f       G q |j                  d        |j                  t        |      df       t        t        |      d	z
        D ]~  }	||	   d   }
||	d	z      d   }||	   d	   }||
| j	                         }t        |      d
k  r@| j                  |
|      }|j                  |dd |||
t        t        |      d      d        t        |      d	k  r| j                  ||      S |S )z&Chunk contract by sections and clauses)z \n\s*(\d+\.)\s+([A-Z][A-Z\s]+)\nz+\n\s*(ARTICLE\s+[IVXLC\d]+)[:\.\s]+([^\n]+)z$\n\s*(SECTION\s+\d+)[:\.\s]+([^\n]+)z"\n\s*(RECITALS|WHEREAS|WITNESSETH)z\\n\s*(DEFINITIONS|TERM|PAYMENT|TERMINATION|CONFIDENTIAL|INDEMNIF|LIMITATION|GOVERNING\s+LAW))r   PREAMBLEr   Nr   c                     | d   S r   r   r   s    rK   r   z7CaseDocumentProcessor._chunk_contract.<locals>.<lambda>  r   rM   r   r   rY      i  r  )r   r  r   r   r   r   r   r  rq   r   r  minr   )rJ   rw   rx   ra   r   r  r   r   r  r   r   r  r  r   s                 rK   r   z%CaseDocumentProcessor._chunk_contract  s   
 &&
 	GGWdBMMB G${{1~335!!5;;=,t2D"EFG	G
 	N+3t9e,-s:*+ 	A"1a(I Q'*G%a=+L	'288:L< 2%229iHHMM'.' '"%c,&7"> 	& v;!''i88rM   c           
      @   g }d}t        t        j                  ||t        j                              }|rd}d}|D ]  }|j	                  d      }	t        |      t        |	      z   dkD  rU|r@| j                  ||      }
|j                  |j                         d|
|t        |      d       |	}|j                         }|d|	z   z  } |r@| j                  ||      }
|j                  |j                         d|
|t        |      d       |s| j                  ||      S |S )z!Chunk deposition by Q&A exchangeszO(Q\.?\s+[^\n]+(?:\n(?![QA]\.)[^\n]+)*)\s*(A\.?\s+[^\n]+(?:\n(?![QA]\.)[^\n]+)*)r   r     zQ&A Exchanger  r   )listr   r  	MULTILINEr   rq   r  r   r   r   r   )rJ   rw   rx   ra   
qa_patternr   current_chunkchunk_startr   qa_textr   s              rK   r   z'CaseDocumentProcessor._chunk_deposition  s/    h
r{{:tR\\BCMK  6++a.}%G4t;$#'#>#>{I#V'4':':'<'5$,*5.1-.@'  %,M"'++-K!Vg%55M!6$ 66{IN,224-$"-&)-&8  ''i88rM   
chunk_sizeoverlapc           
      l   g }t        j                  d|      }d}d}d}	|D ]  }
|
j                         }
|
s|	dz  }	t        |      t        |
      z   |kD  rl|r@| j	                  ||      }|j                  |j                         d||t        |      d       t        |      |kD  r|| d nd}||
z   }|	t        |      z
  }n|r	|d|
z   z  }n|
}|	}|	t        |
      dz   z  }	 |j                         r@| j	                  ||      }|j                  |j                         d||t        |      d       |S )	z'Default semantic chunking by paragraphsz\n\s*\nr   r      r   r  Nr   )r   splitr   rq   r  r   )rJ   rw   rx   r  r   ra   r   r  r  r   parar   overlap_texts                rK   r   z%CaseDocumentProcessor._semantic_chunk  sh     XXj$/
 	)D::<Dq =!CI-
: #::;	RHMM#0#6#6#8#, (&1*-m*<#  <?};MPW;W}gXY7]_ ,t 3)C,== !Vd]2M$(M"-K3t9q=(K9	)>  22;	JHMM(..0$ )"%m"4  rM   base_posc                 ,    | j                  ||dd      S )z(Split a long section into smaller chunksr     )r  r   )r   )rJ   rw   rx   r&  s       rK   r	  z)CaseDocumentProcessor._split_long_sectionP  s     ##D)c#RRrM   posc                 d    |sy|D ]  }|d   |cxk  r	|d   k  sn |d   c S  |r|d   d   S dS )z,Get the page number for a character positionrY   r   r   r   r   )rJ   r)  rx   infos       rK   r  z,CaseDocumentProcessor._get_page_for_positionU  sN     	$DG}1d5k1F|#	$ )2y}V$8q8rM   r   )i  r   )__name__
__module____qualname____doc__r   r   rG   intrL   bytesr   ry   ri   r   r   r   rl   r   r   rm   rn   ro   rp   r   r   r   r   r	  r  r   rM   rK   r
   r
      s   5 \T[edhR]U8 UQkG;B5J9# 9 9 9 -1>
5 >
C >
&)>
59>
@
3 
3 
d  sDJ9O 0/T /eCdO.D /bd uS$t*_/E "3 "c "c ".3 3 XC X X Xt S '+Dz6:4j4Es E3 E%)$ZE48JEN,C ,DJ ,4: ,\,c ,d4j ,T$Z ,^ ?B5C 5DJ 5#&58;5FJ4j5nS ST
 S%(S-1$ZS
	9# 	9$t* 	9 	9rM   r
   )r0  r|   rd   r   pathlibr   typingr   r   r   r   r   jsonr   r   ImportErrorr   r   r
   r   rM   rK   <module>r7     su   
 
  	  . .  JK
B	9 B	9  J  Ks"   A A AAAA