{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "7e32a042",
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"......\n",
"fixed bad single file torrent 4f269d8aefd647ee270842d53ec98aebd23a4afe\n",
"fixed bad single file torrent 7b09ae0b612dafc1744562dccbbe4becf4d633c3\n",
"47843 @ 78.78622500100755 s\n"
]
}
],
"source": [
"from time import monotonic\n",
"from sys import path\n",
"from os import getenv\n",
"path.append(getenv(\"HOME\") + \"/projects/travnik\")\n",
"from travnik import glob\n",
"print(\"......\")\n",
"start = monotonic()\n",
"torrents = glob(\"/var/opt/travnik\")\n",
"print(len(torrents), \"@\", monotonic()-start, \"s\")\n",
"# t = Torrent()\n",
"# t.file(\"/root/projects/travnik/449a38ef7e042bd2d75e8921aa02f6f244165d9d.torrent\")\n",
"# print(t.sha1.hex())\n",
"# for path, length in t.paths():\n",
"# print(path, length)\n",
"# print(t)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "978ab1cf",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"...\n"
]
}
],
"source": [
"from travnik import Type\n",
"for hash, torrent in torrents.items():\n",
" if torrent.type == Type.HYBRID and not torrent.dict.get(b'info').get(b'meta version'):\n",
" print(torrent.sha1.hex(), torrent.sha256.hex())\n",
"print(\"...\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4419e5e",
"metadata": {},
"outputs": [],
"source": [
"s = monotonic()\n",
"prej = None\n",
"skup = 0\n",
"dat = 0\n",
"vel = 0\n",
"for torrent in sorted([torrent for sha1, torrent in torrents.items()], key=lambda x:x.dict.get(b'creation date')):\n",
" č = torrent.dict.get(b'creation date')\n",
" dat += sum(1 for path, size in torrent.paths())\n",
" vel += sum(size for path, size in torrent.paths())\n",
" if not prej:\n",
" prej = č\n",
" continue\n",
" if prej + 60*10 > č:\n",
" skup += č-prej\n",
" prej = č\n",
"print(monotonic()-s, \"torrenti so se zbirali\", skup/86400, \"dni. en torrent je bil najden v povprečju na\", skup/len(torrents), \"sekund, v\", len(torrents), \"so metapodatki\", dat, \"datotek\", \"v skupni velikosti\", vel/(1024**4), \"TiB\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e170de45",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"s = monotonic()\n",
"def uas(normalize=True, minrepr=0):\n",
" odjemalci = {}\n",
" for sha1, torrent in torrents.items():\n",
" odjemalec = torrent.dict.get(b'source').get(b'v')\n",
" if normalize and odjemalec is not None:\n",
" if b'/' in odjemalec:\n",
" odjemalec = odjemalec.split(b'/')[0]\n",
" elif b' (' in odjemalec:\n",
" odjemalec = odjemalec.split(b' (')[0]\n",
" else:\n",
" odjemalec = odjemalec.split(b' ')[0]\n",
" odjemalec = odjemalec.replace(b'\\xc2\\xb5', b'\\xce\\xbc').decode()\n",
" if odjemalec not in odjemalci.keys():\n",
" odjemalci[odjemalec] = 1\n",
" else:\n",
" odjemalci[odjemalec] += 1\n",
" trueodj = {\"ostali\": 0}\n",
" count = 0\n",
" for key, value in odjemalci.items():\n",
" count += 1\n",
" if value < minrepr:\n",
" trueodj[\"ostali\"] += value\n",
" else:\n",
" trueodj[key] = value\n",
" trueodj = [(v, k) for k, v in trueodj.items()]\n",
" return trueodj, count\n",
"odjemalci, count = uas(True, minrepr=0.01*len(torrents))\n",
"odjemalci = sorted(odjemalci, reverse=False)\n",
"from matplotlib import pyplot\n",
"%matplotlib notebook\n",
"fig, axes = pyplot.subplots()\n",
"from math import log\n",
"# axes.pie([log(sights) if sights else 0 for sights, name in odjemalci], labels=[name for sights, name in odjemalci])\n",
"axes.barh([name if name is not None else \"neznan\" for sights, name in odjemalci], [sights for sights, name in odjemalci])\n",
"axes.set_title(\"log skala odjemalcev\")\n",
"pyplot.xscale(\"log\")\n",
"fig.show()\n",
"print(monotonic()-s, \"za\", count, \"različnih odjemalcev\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52de34d6",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"s = monotonic()\n",
"keys = {}\n",
"for sha1, torrent in torrents.items():\n",
" for key in torrent.dict.get(b'info').keys():\n",
" if key.decode() not in keys.keys():\n",
" value = torrent.dict.get(b'info').get(key)\n",
" if type(value) is bytes:\n",
" try:\n",
" value = value.decode()\n",
" except UnicodeDecodeError:\n",
" pass\n",
" keys[key.decode()] = [1, value, sha1.hex()]\n",
" else:\n",
" keys[key.decode()][0] += 1\n",
"sort = sorted(keys, key=lambda x: keys[x][0])\n",
"print(monotonic()-s, \"s\", len(keys))\n",
"%matplotlib notebook\n",
"fig, ax = pyplot.subplots();\n",
"ax.barh(sort, [keys[x][0] for x in sort])\n",
"pyplot.xscale(\"log\")\n",
"pyplot.xlabel(\"število pojavitev ključa v slovarju info\")\n",
"fig.show() ## TODO komentiraj\n",
"for i in sort:\n",
" print(i, keys[i])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fea0f2b6",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"s = monotonic()\n",
"def removeminorities(population, minrepr=0, ostalo=\"ostalo\"):\n",
" true = {ostalo: 0}\n",
" for key, value in population.items():\n",
" if value < minrepr:\n",
" true[ostalo] += value\n",
" else:\n",
" true[key] = value\n",
" return true\n",
"def sources():\n",
" sources = {}\n",
" for sha1, torrent in torrents.items():\n",
" source = torrent.dict.get(b'info').get(b'source')\n",
" if source is None:\n",
" source = torrent.dict.get(b'info').get(b'publisher')\n",
" if source is None:\n",
" source = torrent.dict.get(b'info').get(b'publisher-url')\n",
" if source is None:\n",
" source = torrent.dict.get(b'info').get(b'comment')\n",
" try:\n",
" if type(source) is bytes:\n",
" source = source.decode().strip()\n",
" except UnicodeDecodeError:\n",
" pass\n",
" if source not in sources.keys():\n",
" sources[source] = 1\n",
" else:\n",
" sources[source] += 1\n",
" return sources\n",
"sources = sources()\n",
"sources = removeminorities(sources, len(sources)*0, \"ostali\")\n",
"sort = sorted(sources, reverse=True, key=lambda x:sources[x])\n",
"sort.remove(None)\n",
"print(monotonic()-s, \"s\", sources[None]/len(torrents)*100, \"brez ključa source, publisher, publisher-url ali comment\", len(sources), \"virov\")\n",
"%matplotlib notebook\n",
"fig, ax = pyplot.subplots();\n",
"ax.barh([str(x) for x in sort], [sources[x] for x in sort])\n",
"pyplot.xscale(\"log\")\n",
"pyplot.xlabel(\"število pojavitev distributerja\")\n",
"fig.show() ## TODO komentiraj\n",
"from tabulate import tabulate\n",
"tabulate([[x, sources[x]] for x in sort], tablefmt=\"html\")\n",
"for x in sort:\n",
" print(sources[x], \"\\t\", x)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4bd1f517",
"metadata": {},
"outputs": [],
"source": [
"s = monotonic()\n",
"from mimetypes import guess_type\n",
"def ext(mime=False, minreprratio=0):\n",
" bycount = {}\n",
" bysize = {}\n",
" bysizerepresentative = {}\n",
" filescount = 0\n",
" bytescount = 0\n",
" for sha1, torrent in torrents.items():\n",
" try:\n",
" representatives = {}\n",
" for path, size in torrent.paths():\n",
" filescount += 1\n",
" bytescount += size\n",
" if mime:\n",
" ext = guess_type(path.pop().decode(encoding=\"iso-8859-2\"))[0]\n",
" else:\n",
" ext = path.pop().split(b'.').pop().decode(encoding=\"iso-8859-2\").lower()\n",
" if ext not in bycount.keys():\n",
" bycount[ext] = 1\n",
" else:\n",
" bycount[ext] += 1\n",
" if ext not in bysize.keys():\n",
" bysize[ext] = size\n",
" else:\n",
" bysize[ext] += size\n",
" if ext not in representatives.keys():\n",
" representatives[ext] = size\n",
" else:\n",
" representatives[ext] += size\n",
" except AttributeError:\n",
" print(sha1.hex(), torrent)\n",
" raise AttributeError\n",
" try:\n",
" representative = sorted(representatives, key=lambda x:representatives[x]).pop()\n",
" except IndexError:\n",
" print(sha1.hex(), torrent)\n",
" raise IndexError\n",
" if representative not in bysizerepresentative.keys():\n",
" bysizerepresentative[representative] = 1\n",
" else:\n",
" bysizerepresentative[representative] += 1\n",
" truebycount = removeminorities(bycount, minreprratio*filescount, \"ostale\")\n",
" truebysize = removeminorities(bysize, minreprratio*bytescount, \"ostale\")\n",
" truebysizerepresentative = removeminorities(bysizerepresentative, minreprratio*len(torrents), \"ostale\")\n",
" for data in [truebycount, truebysize, truebysizerepresentative]:\n",
" data = [(v, k) for k, v in data.items()]\n",
" return truebycount, truebysize, truebysizerepresentative, len(bycount), filescount, bytescount\n",
"print(\"...\")\n",
"bycount, bysize, bysizerepresentative, kinds, filescount, bytescount = ext(False, 0.001)\n",
"print(monotonic()-s, \"s\", kinds, \"različnih tipov v\", filescount, \"datotekah in\", bytescount/(1024**4), \"TiB\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "82ab922a",
"metadata": {},
"outputs": [],
"source": [
"sortcount = sorted(bycount, reverse=False, key=lambda x: bycount[x])\n",
"sortsize = sorted(bysize, reverse=False, key=lambda x: bysize[x])\n",
"sortsizerepresentative = sorted(bysizerepresentative, reverse=False, key=lambda x: bysizerepresentative[x])\n",
"from matplotlib import pyplot\n",
"%matplotlib notebook\n",
"for desc, data in {\"po številu datotek\": (sortcount, bycount), \"po velikosti datotek\": (sortsize, bysize), \"po številu po velikosti največjih datotek torrentov\": (sortsizerepresentative, bysizerepresentative)}.items():\n",
" fig, axes = pyplot.subplots()\n",
" # axes.pie([data[1][key] for key in data[0]], labels=data[0])\n",
" axes.barh(data[0], [data[1][key] for key in data[0]])\n",
" pyplot.xscale(\"log\")\n",
" axes.set_title(desc)\n",
" fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fca757e3",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}