U
    kÖ]e9E  ã                   @   sÀ   d Z ddlZddlZddlZddlmZ dgZe d¡Ze d¡Z	e d¡Z
e d¡Ze d	¡Ze d
¡Ze d¡Ze d¡Ze d¡Ze dej¡Ze d
¡Ze d¡ZG dd„ dejƒZdS )zA parser for HTML and XHTML.é    N)ÚunescapeÚ
HTMLParserz[&<]z
&[a-zA-Z#]z%&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]z)&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]z	<[a-zA-Z]ú>z--\s*>z+([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*z]((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*aF  
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
        \s*                          # possibly followed by a space
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
z#</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>c                   @   sè   e Zd ZdZdZddœdd„Zdd„ Zd	d
„ Zdd„ ZdZ	dd„ Z
dd„ Zdd„ Zdd„ Zdd„ Zd9dd„Zdd„ Zdd„ Zdd „ Zd!d"„ Zd#d$„ Zd%d&„ Zd'd(„ Zd)d*„ Zd+d,„ Zd-d.„ Zd/d0„ Zd1d2„ Zd3d4„ Zd5d6„ Zd7d8„ ZdS ):r   aE  Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    )ZscriptZstyleT)Úconvert_charrefsc                C   s   || _ |  ¡  dS )zÆInitialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        N)r   Úreset)Úselfr   © r   ú!/usr/lib/python3.8/html/parser.pyÚ__init__W   s    zHTMLParser.__init__c                 C   s(   d| _ d| _t| _d| _tj | ¡ dS )z1Reset this instance.  Loses all unprocessed data.Ú z???N)ÚrawdataÚlasttagÚinteresting_normalÚinterestingÚ
cdata_elemÚ_markupbaseÚ
ParserBaser   ©r   r   r   r	   r   `   s
    zHTMLParser.resetc                 C   s   | j | | _ |  d¡ dS )z‘Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        r   N)r   Úgoahead©r   Údatar   r   r	   Úfeedh   s    zHTMLParser.feedc                 C   s   |   d¡ dS )zHandle any buffered data.é   N)r   r   r   r   r	   Úcloseq   s    zHTMLParser.closeNc                 C   s   | j S )z)Return full source of start tag: '<...>'.)Ú_HTMLParser__starttag_textr   r   r   r	   Úget_starttag_textw   s    zHTMLParser.get_starttag_textc                 C   s$   |  ¡ | _t d| j tj¡| _d S )Nz</\s*%s\s*>)Úlowerr   ÚreÚcompileÚIr   )r   Úelemr   r   r	   Úset_cdata_mode{   s    
zHTMLParser.set_cdata_modec                 C   s   t | _d | _d S ©N)r   r   r   r   r   r   r	   Úclear_cdata_mode   s    zHTMLParser.clear_cdata_modec                 C   sX  | j }d}t|ƒ}||k rè| jrv| jsv| d|¡}|dk r | dt||d ƒ¡}|dkrpt d¡ 	||¡spqè|}n*| j
 	||¡}|r’| ¡ }n| jrœqè|}||k rÞ| jrÌ| jsÌ|  t|||… ƒ¡ n|  |||… ¡ |  ||¡}||kröqè|j}|d|ƒrJt ||¡r"|  |¡}	n†|d|ƒr:|  |¡}	nn|d|ƒrR|  |¡}	nV|d|ƒrj|  |¡}	n>|d	|ƒr‚|  |¡}	n&|d
 |k rè|  d¡ |d
 }	nqè|	dk r<|s¼qè| d|d
 ¡}	|	dk rú| d|d
 ¡}	|	dk r|d
 }	n|	d
7 }	| jr*| js*|  t|||	… ƒ¡ n|  |||	… ¡ |  ||	¡}q|d|ƒrðt ||¡}|r²| ¡ dd… }
|  |
¡ | ¡ }	|d|	d
 ƒs¢|	d
 }	|  ||	¡}qn<d||d … krè|  |||d … ¡ |  ||d ¡}qèq|d|ƒrÚt ||¡}|rP| d
¡}
|  |
¡ | ¡ }	|d|	d
 ƒsB|	d
 }	|  ||	¡}qt ||¡}|rª|rè| ¡ ||d … krè| ¡ }	|	|kr”|}	|  ||d
 ¡}qèn.|d
 |k rè|  d¡ |  ||d
 ¡}nqèqdstdƒ‚q|rF||k rF| jsF| jr(| js(|  t|||… ƒ¡ n|  |||… ¡ |  ||¡}||d … | _ d S )Nr   ú<ú&é"   z[\s;]ú</ú<!--ú<?ú<!r   r   z&#é   éÿÿÿÿú;zinteresting.search() lied)r   Úlenr   r   ÚfindÚrfindÚmaxr   r   Úsearchr   ÚstartÚhandle_datar   Z	updateposÚ
startswithÚstarttagopenÚmatchÚparse_starttagÚparse_endtagÚparse_commentÚparse_piÚparse_html_declarationÚcharrefÚgroupÚhandle_charrefÚendÚ	entityrefÚhandle_entityrefÚ
incompleteÚAssertionError)r   r@   r   ÚiÚnÚjZampposr7   r5   ÚkÚnamer   r   r	   r   †   sÄ    
ÿ
 










zHTMLParser.goaheadc                 C   s¾   | j }|||d … dks"tdƒ‚|||d … dkr@|  |¡S |||d … dkr^|  |¡S |||d …  ¡ d	kr°| d
|d ¡}|dkr’dS |  ||d |… ¡ |d S |  |¡S d S )Nr+   r*   z+unexpected call to parse_html_declaration()é   r(   é   z<![é	   z	<!doctyper   r,   r   )r   rD   r:   Zparse_marked_sectionr   r/   Úhandle_declÚparse_bogus_comment)r   rE   r   Úgtposr   r   r	   r<      s    

z!HTMLParser.parse_html_declarationr   c                 C   s`   | j }|||d … dks"tdƒ‚| d|d ¡}|dkr>dS |rX|  ||d |… ¡ |d S )Nr+   )r*   r'   z"unexpected call to parse_comment()r   r,   r   )r   rD   r/   Úhandle_comment)r   rE   Zreportr   Úposr   r   r	   rN     s    zHTMLParser.parse_bogus_commentc                 C   sd   | j }|||d … dks"tdƒ‚t ||d ¡}|s:dS | ¡ }|  ||d |… ¡ | ¡ }|S )Nr+   r)   zunexpected call to parse_pi()r,   )r   rD   Úpicloser2   r3   Ú	handle_pir@   )r   rE   r   r7   rG   r   r   r	   r;   !  s    zHTMLParser.parse_pic                 C   sì  d | _ |  |¡}|dk r|S | j}|||… | _ g }t ||d ¡}|sPtdƒ‚| ¡ }| d¡ ¡  | _	}||k r.t
 ||¡}|sŠq.| ddd¡\}	}
}|
s¨d }n\|d d… d  krÌ|dd … ksøn |d d… d  krô|dd … krn n|dd… }|rt|ƒ}| |	 ¡ |f¡ | ¡ }ql|||…  ¡ }|d	kr¬|  ¡ \}}d
| j krˆ|| j  d
¡ }t| j ƒ| j  d
¡ }n|t| j ƒ }|  |||… ¡ |S | d¡rÆ|  ||¡ n"|  ||¡ || jkrè|  |¡ |S )Nr   r   z#unexpected call to parse_starttag()r+   rK   ú'r,   ú")r   ú/>Ú
rV   )r   Úcheck_for_whole_start_tagr   Útagfind_tolerantr7   rD   r@   r>   r   r   Úattrfind_tolerantr   ÚappendÚstripZgetposÚcountr.   r0   r4   ÚendswithÚhandle_startendtagÚhandle_starttagÚCDATA_CONTENT_ELEMENTSr!   )r   rE   Úendposr   Úattrsr7   rH   ÚtagÚmÚattrnameÚrestZ	attrvaluer@   ÚlinenoÚoffsetr   r   r	   r8   -  s\    

&
 ÿ
ÿ


ÿ
zHTMLParser.parse_starttagc                 C   s¶   | j }t ||¡}|rª| ¡ }|||d … }|dkr>|d S |dkr~| d|¡rZ|d S | d|¡rjdS ||krv|S |d S |dkrŠdS |dkr–dS ||kr¢|S |d S td	ƒ‚d S )
Nr   r   ú/rV   r+   r,   r   z6abcdefghijklmnopqrstuvwxyz=/ABCDEFGHIJKLMNOPQRSTUVWXYZzwe should not get here!)r   Úlocatestarttagend_tolerantr7   r@   r5   rD   )r   rE   r   re   rG   Únextr   r   r	   rX   `  s.    z$HTMLParser.check_for_whole_start_tagc                 C   s.  | j }|||d … dks"tdƒ‚t ||d ¡}|s:dS | ¡ }t ||¡}|sÜ| jd k	rr|  |||… ¡ |S t	 ||d ¡}|s¬|||d … dkr¢|d S |  
|¡S | d¡ ¡ }| d| ¡ ¡}|  |¡ |d S | d¡ ¡ }| jd k	r|| jkr|  |||… ¡ |S |  |¡ |  ¡  |S )	Nr+   r'   zunexpected call to parse_endtagr   r,   rK   z</>r   )r   rD   Ú	endendtagr2   r@   Ú
endtagfindr7   r   r4   rY   rN   r>   r   r/   Úhandle_endtagr#   )r   rE   r   r7   rO   Z	namematchZtagnamer    r   r   r	   r9   ‚  s8    



zHTMLParser.parse_endtagc                 C   s   |   ||¡ |  |¡ d S r"   )r`   ro   ©r   rd   rc   r   r   r	   r_   ª  s    zHTMLParser.handle_startendtagc                 C   s   d S r"   r   rp   r   r   r	   r`   ¯  s    zHTMLParser.handle_starttagc                 C   s   d S r"   r   )r   rd   r   r   r	   ro   ³  s    zHTMLParser.handle_endtagc                 C   s   d S r"   r   ©r   rI   r   r   r	   r?   ·  s    zHTMLParser.handle_charrefc                 C   s   d S r"   r   rq   r   r   r	   rB   »  s    zHTMLParser.handle_entityrefc                 C   s   d S r"   r   r   r   r   r	   r4   ¿  s    zHTMLParser.handle_datac                 C   s   d S r"   r   r   r   r   r	   rP   Ã  s    zHTMLParser.handle_commentc                 C   s   d S r"   r   )r   Zdeclr   r   r	   rM   Ç  s    zHTMLParser.handle_declc                 C   s   d S r"   r   r   r   r   r	   rS   Ë  s    zHTMLParser.handle_pic                 C   s   d S r"   r   r   r   r   r	   Úunknown_declÎ  s    zHTMLParser.unknown_declc                 C   s   t jdtdd t|ƒS )NzZThe unescape method is deprecated and will be removed in 3.5, use html.unescape() instead.r+   )Ú
stacklevel)ÚwarningsÚwarnÚDeprecationWarningr   )r   Úsr   r   r	   r   Ò  s
     þzHTMLParser.unescape)r   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__ra   r
   r   r   r   r   r   r!   r#   r   r<   rN   r;   r8   rX   r9   r_   r`   ro   r?   rB   r4   rP   rM   rS   rr   r   r   r   r   r	   r   ?   s8   		z
3"()r{   r   rt   r   Zhtmlr   Ú__all__r   r   rC   rA   r=   r6   rR   ZcommentcloserY   rZ   ÚVERBOSErk   rm   rn   r   r   r   r   r   r	   Ú<module>   s,   








ÿò

