diff options
Diffstat (limited to 'get-kisa-as-list')
-rwxr-xr-x | get-kisa-as-list | 46 |
1 files changed, 46 insertions, 0 deletions
#!/usr/bin/env python3
"""Fetch the AS list page at TARGET_URL and print one line per table row.

The page is scanned for a ``<table>`` whose ``class`` attribute contains
``datatable``.  For every row of that table with at least two ``<td>`` cells,
the second cell is printed left-aligned in a 12-character column, followed by
a tab and the whitespace-normalised first cell (presumably AS number and AS
name respectively -- verify against the live page).
"""
import html
import html.parser

TARGET_URL = 'https://krnic.kisa.or.kr/jsp/business/management/asList.jsp'

# Elements that never have an end tag; pushing them on the open-element stack
# would leave it permanently out of sync with the document.
_VOID_ELEMENTS = frozenset({
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'link', 'meta', 'param', 'source', 'track', 'wbr',
})


def sanitize_as_name(s: str) -> str:
    """Collapse every run of whitespace in *s* to one space and trim the ends."""
    return ' '.join(s.split())


class KISAASListExtractor(html.parser.HTMLParser):
    """HTML parser that prints the rows of the ``datatable`` table to stdout.

    Only text that is a direct child of a ``<td>`` is collected; text inside
    elements nested in a cell (``<a>``, ``<span>``, ...) is ignored.  Header
    rows made of ``<th>`` cells therefore produce no output.
    """

    def __init__(self) -> None:
        super().__init__()
        # Per-instance state: as class attributes these lists would be shared
        # by every parser instance.
        self._table_started = False       # inside the datatable table?
        self._td: list[str] = []          # text of each <td> in the current row
        self._tag_stack: list[str] = []   # currently open (non-void) elements

    def handle_starttag(self, tag, attrs):
        """Track open elements, detect the data table, and open rows/cells."""
        if tag not in _VOID_ELEMENTS:
            self._tag_stack.append(tag)

        if self._table_started:
            if tag == 'tr':
                self._td.clear()
            elif tag == 'td':
                # One entry per cell so list indices match column positions
                # even when a cell is empty or its text arrives in pieces.
                self._td.append('')
        elif tag == 'table':
            # A valueless attribute is reported as None, hence the `or ''`.
            classes = (dict(attrs).get('class') or '').split()
            if 'datatable' in classes:
                self._table_started = True

    def handle_endtag(self, tag):
        """Emit a finished row and unwind the open-element stack."""
        if tag in _VOID_ELEMENTS:
            # Nothing was pushed for it (also covers self-closing `<br/>`).
            return

        if self._table_started:
            if tag == 'table':
                self._table_started = False
                self._td.clear()
            elif tag == 'tr':
                # Rows with fewer than two cells cannot be formatted; skip
                # them instead of failing with IndexError.
                if len(self._td) >= 2:
                    name = sanitize_as_name(self._td[0])
                    print(f"{self._td[1]:<12}\t{name}")
                # Clear so a later stray </tr> cannot re-print this row.
                self._td.clear()

        # Pop up to and including the nearest matching open element, which
        # tolerates omitted end tags; an unmatched end tag is ignored.
        if tag in self._tag_stack:
            while self._tag_stack.pop() != tag:
                pass

    def handle_data(self, data):
        """Append text found directly inside a ``<td>`` to the current cell."""
        if (self._table_started and self._td and self._tag_stack
                and self._tag_stack[-1] == 'td'):
            self._td[-1] += data


def main() -> None:
    """Download TARGET_URL and print the extracted rows to stdout."""
    # Imported here so the parser can be imported and exercised without the
    # third-party `requests` package or network access.
    import requests

    doc_parser = KISAASListExtractor()

    # A timeout keeps the script from hanging forever on a stalled server.
    with requests.get(TARGET_URL, timeout=30) as req:
        req.raise_for_status()
        raw = req.content.decode(req.encoding or 'utf-8')

    doc_parser.feed(raw)
    doc_parser.close()  # flush any text still buffered by the parser


if __name__ == '__main__':
    main()