Script to get filename stats
I wanted to generate word statistics for a large number of files, so I wrote this script. It removes special characters from the filenames and splits each filename into words. The extension is discarded.
#!/usr/bin/env python
import argparse
import configparser
import os
import pathlib
import re
import argcomplete
# Default threshold: words that occur fewer times than this are not printed.
mincount = 10
def getconf():
    """
    Load the settings from ~/.config/filenamestat.ini.

    The built-in defaults are loaded first, so an option that is missing
    from an existing file falls back to its default instead of raising a
    KeyError in the caller. If the file does not exist yet, it is created
    (together with the ``.config`` directory) and filled with the defaults.

    :returns: configparser.ConfigParser whose DEFAULT section holds the
        options "minlen", "mincount" and "stopwords" (all as strings).
    """
    config = configparser.ConfigParser()
    config["DEFAULT"] = {
        "minlen": "3",
        "mincount": "10",
        "stopwords": "and,the,for",
    }
    configfile = pathlib.Path.home() / ".config" / "filenamestat.ini"
    if configfile.is_file():
        # Values found in the file override the defaults set above.
        config.read(configfile)
        return config
    # ~/.config may not exist yet (fresh account); without this the open()
    # below fails with FileNotFoundError.
    configfile.parent.mkdir(parents=True, exist_ok=True)
    with open(configfile, "w") as fh:
        config.write(fh)
    return config
def filenamestats(paths):
    """
    Count the words found in the names of all files below the given paths.

    Every directory is walked recursively. Each filename has its extension
    removed, is lowercased and is then split into words on every run of
    non-alphanumeric characters (underscores count as separators too).

    :param paths: iterable of directory paths to walk; a single str or
        os.PathLike is accepted as well and treated as one path.
    :returns: dict where the keys are found words and the values are the
        occurrences.
    """
    if isinstance(paths, (str, os.PathLike)):
        # Iterating a bare string would walk every character as a path.
        paths = [paths]
    separators = re.compile(r"[\W_]+")
    words = {}
    for path in paths:
        for _dirpath, _dirnames, filenames in os.walk(path):
            for filename in filenames:
                # Remove extension
                stem = os.path.splitext(filename.strip())[0].lower()
                # split() without arguments drops the empty strings that a
                # leading or trailing separator would otherwise produce.
                for word in separators.sub(" ", stem).split():
                    words[word] = words.get(word, 0) + 1
    return words
def dumpstats(words, stopwords=None, minlen=3, mincount=mincount):
    """
    Print the word counts as "word: count" lines, highest count first.

    :param words: dict mapping each word to its number of occurrences.
    :param stopwords: words that are never printed; None means no stopwords.
    :param minlen: words shorter than this are not printed.
    :param mincount: words with a lower count than this are not printed.
    """
    if not stopwords:
        stopwords = []
    ranked = sorted(words.items(), key=lambda entry: entry[1], reverse=True)
    # Lazily filter the ranked entries so printing happens in ranking order.
    shown = (
        (word, count)
        for word, count in ranked
        if word not in stopwords and len(word) >= minlen and count >= mincount
    )
    for word, count in shown:
        print(f"{word}: {count}")
# Command-line entry point: parse the arguments, merge them with the settings
# from the config file and print the word statistics.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Filename stats")
    parser.add_argument(
        # The default is a list so that args.paths is always a list of paths.
        "paths", type=str, nargs="*", default=["."], help="Path to search"
    )
    parser.add_argument(
        "--mincount",
        "-m",
        type=int,
        # None means "not given on the command line", which lets the value
        # from the config file apply and keeps an explicit 0 usable.
        default=None,
        help="Show only results with that many matches (default: from config file)",
    )
    parser.add_argument(
        "--stopwords",
        "-s",
        nargs="*",
        default=[],
        help="Ignore those words",
    )
    # argcomplete has to hook in before the arguments are parsed; called
    # afterwards it never gets the chance to produce completions.
    argcomplete.autocomplete(parser)
    args = parser.parse_args()
    config = getconf()
    defaults = config["DEFAULT"]
    # The counted words are always lowercase, so normalize the stopwords the
    # same way; also tolerate "and, the" style spacing in the config file.
    candidates = args.stopwords + defaults["stopwords"].split(",")
    stopwords = [word.strip().lower() for word in candidates if word.strip()]
    if args.mincount is None:
        mincount = int(defaults["mincount"])
    else:
        mincount = args.mincount
    words = filenamestats(args.paths)
    dumpstats(
        words,
        stopwords=stopwords,
        minlen=int(defaults["minlen"]),
        mincount=mincount,
    )
0 comments
Reply