はじめに
PythonでHTMLの
タグ(Tag)/属性(Attribute)/コンテンツ(Contents)/コメント(Comment)/doctype
を解析します。
HTMLを解析する
入力となるHTMLは以下です。以降のコードでは、このHTMLを index.html という名前で保存して読み込みます。
<!DOCTYPE html>
<!--Comments-->
<html>
<head>
<title>Title</title>
</head>
<body>
<font size="10" color="red">Hello</font>
<font color="blue">World</font><a href="#">!!</a>
</body>
</html>
タグ
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Parser that reports every opening and closing tag it encounters."""

    def handle_starttag(self, tag, attrs):
        """Print the name of an opening tag.

        Args:
            tag: Tag name as passed in by the parser.
            attrs: List of ``(name, value)`` attribute pairs; unused here.
        """
        print("Start tag:", tag)

    def handle_endtag(self, tag):
        """Print the name of a closing tag.

        Without this handler closing tags are silently ignored and no
        "End tag:" line is ever printed.

        Args:
            tag: Tag name as passed in by the parser.
        """
        print("End tag:", tag)
# Read the whole document in one call instead of concatenating it line by
# line; the resulting text is the same.
with open("index.html", "r") as f:
    string = f.read()

# Feed the text to the parser; the handlers print as the text is parsed.
parser = MyHTMLParser()
parser.feed(string)
# 出力
# Start tag: html
# Start tag: head
# Start tag: title
# End tag: title
# End tag: head
# 以下略
属性
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Parser that prints every attribute of every opening tag."""

    def handle_starttag(self, tag, attrs):
        """Print each entry of ``attrs`` on its own line.

        Args:
            tag: Tag name as passed in by the parser; unused here.
            attrs: Iterable of attribute entries; each one is printed as-is
                after the "Attr:" label.
        """
        label = "Attr:"
        for pair in attrs:
            print(label, pair)
# Load the complete file with a single read rather than building the string
# up one line at a time; the content is identical.
with open("index.html", "r") as f:
    string = f.read()

# Parsing triggers the handler, which prints the attributes.
parser = MyHTMLParser()
parser.feed(string)
# 出力
# Attr: ('size', '10')
# Attr: ('color', 'red')
# Attr: ('color', 'blue')
# Attr: ('href', '#')
コンテンツ
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Parser that prints the text content found between tags."""

    def handle_data(self, data):
        """Print a text node unless it consists only of whitespace.

        Line breaks and indentation between tags arrive here as text nodes
        of their own. Skipping them in the handler means the input does not
        have to be altered before parsing.

        Args:
            data: Text passed in by the parser.
        """
        if not data.strip():
            return
        print("Contents:", data)
# Strip only the leading/trailing whitespace (indentation and the line
# break) of every line. This removes the whitespace-only text between tags
# while keeping the spaces inside a line, so tag names are not fused with
# their attributes and words inside text are not glued together.
with open("index.html", "r") as f:
    string = "".join(line.strip() for line in f)

parser = MyHTMLParser()
parser.feed(string)
# 出力
# Contents: Title
# Contents: Hello
# Contents: World
# Contents: !!
コメント
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Parser that prints the body of every HTML comment."""

    def handle_comment(self, data):
        """Print the comment text after the "Comment:" label.

        Args:
            data: Comment text as passed in by the parser.
        """
        label = "Comment:"
        print(label, data)
# One read() call returns the same text as joining the lines by hand.
with open("index.html", "r") as f:
    string = f.read()

# The comment handler prints while the text is being parsed.
parser = MyHTMLParser()
parser.feed(string)
# 出力
# Comment: Comments
doctype
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Parser that prints declarations such as the doctype."""

    def handle_decl(self, data):
        """Print the declaration text after the "Doctype:" label.

        Args:
            data: Declaration text as passed in by the parser.
        """
        label = "Doctype:"
        print(label, data)
# Fetch the file content in a single call; no manual line concatenation is
# needed to obtain the same text.
with open("index.html", "r") as f:
    string = f.read()

parser = MyHTMLParser()
parser.feed(string)
# 出力
# Doctype: DOCTYPE html
サンプルコード
あるタグの属性を取得する
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Parser that prints the attributes of one chosen tag."""

    def handle_starttag(self, tag, attrs):
        """Report an opening tag and its attributes if it is the chosen tag.

        Args:
            tag: Tag name as passed in by the parser.
            attrs: List of ``(name, value)`` attribute pairs.
        """
        if self.tag != tag:
            return
        # Announce the tag first so the attribute lines of consecutive
        # matching tags can be told apart in the output.
        print("start tag:", tag)
        for attr in attrs:
            print("attr:", attr)

    def feed(self, data, tag):
        """Parse ``data``, reporting only tags named ``tag``.

        Args:
            data: HTML text to parse.
            tag: Name of the tag whose attributes should be printed; it is
                compared with the names the parser passes to the handler.
        """
        self.tag = tag
        super().feed(data)
# Read the document in one go instead of appending line after line.
with open("index.html", "r") as f:
    string = f.read()

# The second argument selects the tag whose attributes are printed.
parser = MyHTMLParser()
parser.feed(string, "font")
# 出力
# start tag: font
# attr: ('size', '10')
# attr: ('color', 'red')
# start tag: font
# attr: ('color', 'blue')
あるタグのコンテンツを取得する
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Parser that prints the text content of one chosen tag."""

    def handle_starttag(self, tag, _):
        """Arm the flag when the chosen tag opens.

        Args:
            tag: Tag name as passed in by the parser.
            _: Attribute list; unused here.
        """
        if self.tag == tag:
            self.check = True

    def handle_endtag(self, tag):
        """Disarm the flag when the chosen tag closes.

        Without this reset an element that has no text (for example
        ``<font></font>``) leaves the flag armed, and the next unrelated text
        node would be printed as if it belonged to the chosen tag.

        Args:
            tag: Tag name as passed in by the parser.
        """
        if self.tag == tag:
            self.check = False

    def handle_data(self, data):
        """Print the first text node that follows the chosen tag's opening.

        Args:
            data: Text passed in by the parser.
        """
        if self.check:
            # Only the first text node after the opening tag is reported.
            self.check = False
            print("Contents:", data)

    def feed(self, data, tag):
        """Parse ``data``, reporting text that follows tags named ``tag``.

        Args:
            data: HTML text to parse.
            tag: Name of the tag whose content should be printed.
        """
        self.tag = tag
        self.check = False
        super().feed(data)
# A single read() yields the same text as the manual line-by-line build-up.
with open("index.html", "r") as f:
    string = f.read()

# The second argument selects the tag whose text content is printed.
parser = MyHTMLParser()
parser.feed(string, "font")
# 出力
# Contents: Hello
# Contents: World
ある属性の値を取得する
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Parser that prints the value of one chosen attribute."""

    def handle_starttag(self, _, attrs):
        """Print the chosen attribute's value if this tag carries it.

        Args:
            _: Tag name; unused here.
            attrs: List of ``(name, value)`` attribute pairs.
        """
        # Build the mapping once; if a name is repeated the last pair wins.
        attributes = dict(attrs)
        if self.attr in attributes:
            print(attributes[self.attr])

    def feed(self, data, attr):
        """Parse ``data``, reporting values of the attribute named ``attr``.

        Args:
            data: HTML text to parse.
            attr: Name of the attribute whose values should be printed; it
                is compared with the names the parser passes to the handler.
        """
        self.attr = attr
        super().feed(data)
# Read the entire file at once rather than concatenating each line.
with open("index.html", "r") as f:
    string = f.read()

# The second argument selects the attribute whose values are printed.
parser = MyHTMLParser()
parser.feed(string, "color")
# 出力
# red
# blue
まとめ
PythonでHTMLを解析しました
コメント