Refactor find_redundant to fix #201

This commit is contained in:
Dazzy Ding 2019-05-08 11:02:39 +08:00 committed by Felix Yan
parent f7e27452b6
commit a29cf774e9
1 changed file with 52 additions and 35 deletions

View File

@ -1,47 +1,64 @@
#!/usr/bin/env python3
''' Find redundant items in accelerated-domains.china.conf.

    e.g. 'bar.foo.com' is redundant for 'foo.com'.
'''


def load(conf_file):
    ''' Parse a conf file into a list of domain label lists.

    Blank lines and lines starting with '#' are skipped.  For every other
    line the domain name is the text between the first and second '/'
    (presumably dnsmasq-style entries such as
    `server=/foo.com/114.114.114.114` -- confirm against the conf file).
    Domain names are lower-cased because they are case-insensitive.

    Args:
        conf_file: Path of the conf file to read.

    Returns:
        A list of label lists, sorted by number of labels (shortest first):
            [ ['abc', 'com'],
              ['bar', 'foo', 'com'],
              ... ]

    Raises:
        IndexError: If a non-blank, non-comment line contains no '/'.
    '''
    results = []
    with open(conf_file, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of materialising every line first.
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue

            # A domain name is case-insensitive and consists of several
            # labels separated by a full stop.
            domain_name = line.split('/')[1].lower()
            results.append(domain_name.split('.'))

    # Shorter domains first, so that a parent domain is always seen
    # before any of its subdomains.
    results.sort(key=len)
    return results


def find(labelses):
    ''' Find redundant items using a tree from top-level label to sub-level.

    `tree` is like { 'com': { 'foo': { 'bar': LEAF },
                              'abc': LEAF },
                     'org': ... }

    A domain is redundant when it equals, or is a subdomain of, a domain
    that is already in the tree.  Each redundant domain is reported on
    stdout as "Redundant found: <domain> at <label>".

    Args:
        labelses: Iterable of label lists, e.g. [['bar', 'foo', 'com'], ...].
            The lists are not modified.  Any order is accepted; items are
            processed shortest-first (stable), so input already sorted by
            length is processed in its given order.

    Returns:
        A list of the redundant domain names, in the order they were found.
    '''
    LEAF = 1
    tree = {}
    redundant = []

    # Parents must be inserted before their subdomains; sorted() is stable,
    # so pre-sorted input keeps exactly the same processing order.
    for labels in sorted(labelses, key=len):
        domain = '.'.join(labels)
        last = len(labels) - 1
        # Init root node as current node
        node = tree

        # Walk from the top-level label down, without mutating `labels`.
        for depth, label in enumerate(reversed(labels)):
            child = node.get(label)

            # If the child node is a LEAF, the current domain is either an
            # existing domain or a subdomain of an existing one.
            if child == LEAF:
                print(f"Redundant found: {domain} at {label}")
                redundant.append(domain)
                break

            if child is None:
                # Create a leaf node for the last label, a branch otherwise.
                child = LEAF if depth == last else {}
                node[label] = child

            # Iterate to child node
            node = child

    return redundant


if __name__ == '__main__':
    find(load('accelerated-domains.china.conf'))