246 lines
6.8 KiB
Plaintext
Executable File
246 lines
6.8 KiB
Plaintext
Executable File
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import math\n",
|
|
"import string\n",
|
|
"from collections import Counter"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Этап 1: Чтение текста из файла"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 26,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"with open('some_text.txt', 'r', encoding='utf-8') as file:\n",
|
|
" text = file.read()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Этап 2: Привести все слова к нижнему регистру, удалить знаки препинания и пробелы"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 27,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"text = text.lower() # Приведение к нижнему регистру\n",
|
|
"text = text.translate(str.maketrans('', '', string.punctuation + '\\n')) # Удаление знаков препинания\n",
|
|
"text = text.replace(\" \", \"\") # Удаление пробелов"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 28,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"with open('new_text.txt', 'w', encoding='utf-8') as file:\n",
|
|
" file.write(text)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Этап 3: Подсчитать частоту появления однобуквенных и двухбуквенных сочетаний"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def calculate_frequencies(text):\n",
|
|
" single_char_freq = Counter(text) # Частоты однобуквенных сочетаний\n",
|
|
" bigrams = [text[i:i+2] for i in range(len(text)-1)] # Двухбуквенные сочетания\n",
|
|
" bigram_freq = Counter(bigrams)\n",
|
|
" return single_char_freq, bigram_freq"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 30,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"single_char_freq, bigram_freq = calculate_frequencies(text)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Этап 4: Определение энтропии"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 31,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def calculate_entropy(freq_dict, total_count):\n",
|
|
" entropy = 0\n",
|
|
" for freq in freq_dict.values():\n",
|
|
" p = freq / total_count\n",
|
|
" entropy -= p * math.log2(p)\n",
|
|
" return entropy"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 32,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Энтропия на одну букву: 3.9961505474490147\n",
|
|
"Энтропия на двухбуквенное сочетание: 7.389736513198184\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"single_char_entropy = calculate_entropy(single_char_freq, len(text))\n",
|
|
"bigram_entropy = calculate_entropy(bigram_freq, len(text)-1)\n",
|
|
"\n",
|
|
"print(f\"Энтропия на одну букву: {single_char_entropy}\")\n",
|
|
"print(f\"Энтропия на двухбуквенное сочетание: {bigram_entropy}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Этап 5: Длина кода и избыточность"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 33,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Длина кода при равномерном кодировании: 4.459431618637297\n",
|
|
"Избыточность: 0.46328107118828266\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"alphabet_size = len(single_char_freq)\n",
|
|
"uniform_code_length = math.log2(alphabet_size)\n",
|
|
"redundancy = uniform_code_length - single_char_entropy\n",
|
|
"\n",
|
|
"print(f\"Длина кода при равномерном кодировании: {uniform_code_length}\")\n",
|
|
"print(f\"Избыточность: {redundancy}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Этап 6: Удаление 20% наиболее частых символов"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 34,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Энтропия после удаления 20% самых частых символов: 3.7022435891926633\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"num_to_remove = int(len(single_char_freq) * 0.2)\n",
|
|
"most_common = [char for char, _ in single_char_freq.most_common(num_to_remove)]\n",
|
|
"text_without_frequent = ''.join([char for char in text if char not in most_common])\n",
|
|
"\n",
|
|
"new_single_char_freq, _ = calculate_frequencies(text_without_frequent)\n",
|
|
"new_single_char_entropy = calculate_entropy(new_single_char_freq, len(text_without_frequent))\n",
|
|
"\n",
|
|
"print(f\"Энтропия после удаления 20% самых частых символов: {new_single_char_entropy}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Этап 7: Удаление 20% наиболее редких символов"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 35,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Энтропия после удаления 20% самых редких символов: 3.9045244598725413\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"num_to_remove = int(len(single_char_freq) * 0.2)\n",
|
|
"least_common = [char for char, _ in single_char_freq.most_common()[:-num_to_remove-1:-1]]\n",
|
|
"text_without_rare = ''.join([char for char in text if char not in least_common])\n",
|
|
"\n",
|
|
"new_single_char_freq, _ = calculate_frequencies(text_without_rare)\n",
|
|
"new_single_char_entropy = calculate_entropy(new_single_char_freq, len(text_without_rare))\n",
|
|
"\n",
|
|
"print(f\"Энтропия после удаления 20% самых редких символов: {new_single_char_entropy}\")"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|