first commit

This commit is contained in:
Adolfo Delorenzo 2025-04-04 13:23:15 -06:00
commit 216064f731
2103 changed files with 522593 additions and 0 deletions

0
README.md Normal file
View File

108
app.py Normal file
View File

@ -0,0 +1,108 @@
from flask import Flask, render_template, request, jsonify
import requests
import json
import speech_recognition as sr
import io
import base64
import os
app = Flask(__name__)
SUPPORTED_LANGUAGES = {
"arabic": "Arabic",
"armenian": "Armenian",
"azerbaijani": "Azerbaijani",
"english": "English",
"french": "French",
"georgian": "Georgian",
"kazakh": "Kazakh",
"mandarin": "Mandarin Chinese",
"persian": "Persian (Farsi)",
"portuguese": "Portuguese",
"russian": "Russian",
"turkish": "Turkish",
"uzbek": "Uzbek"
}
OLLAMA_API_URL = "http://100.64.0.4:11434/api/generate"
@app.route('/')
def index():
return render_template('index.html', languages=SUPPORTED_LANGUAGES)
@app.route('/translate', methods=['POST'])
def translate():
data = request.json
source_language = data.get('sourceLanguage')
target_language = data.get('targetLanguage')
text = data.get('text')
if not all([source_language, target_language, text]):
return jsonify({"error": "Missing required parameters"}), 400
# Create prompt for Gemma 3
prompt = f"""Translate the following text from {source_language} to {target_language}.
Text to translate: {text}
Provide ONLY the translated text with no additional commentary, explanations, or formatting.
"""
# Call Ollama API with the Gemma 3 model
payload = {
"model": "gemma3:12b",
"prompt": prompt,
"stream": False
}
try:
response = requests.post(OLLAMA_API_URL, json=payload)
response.raise_for_status()
result = response.json()
# Extract the generated translation
translation = result.get("response", "").strip()
return jsonify({"translation": translation})
except Exception as e:
return jsonify({"error": f"Translation failed: {str(e)}"}), 500
@app.route('/speech-to-text', methods=['POST'])
def speech_to_text():
try:
audio_data = request.json.get('audio')
language_code = request.json.get('language')
# Convert base64 audio to file
audio_bytes = base64.b64decode(audio_data.split(',')[1])
# Use speech recognition
recognizer = sr.Recognizer()
with sr.AudioFile(io.BytesIO(audio_bytes)) as source:
audio = recognizer.record(source)
# Convert speech to text
language_code_map = {
"arabic": "ar-AR",
"armenian": "hy-AM",
"azerbaijani": "az-AZ",
"english": "en-US",
"french": "fr-FR",
"georgian": "ka-GE",
"kazakh": "kk-KZ",
"mandarin": "zh-CN",
"persian": "fa-IR",
"portuguese": "pt-PT",
"russian": "ru-RU",
"turkish": "tr-TR",
"uzbek": "uz-UZ"
}
lang_code = language_code_map.get(language_code, "en-US")
text = recognizer.recognize_google(audio, language=lang_code)
return jsonify({"text": text})
except Exception as e:
return jsonify({"error": f"Speech recognition failed: {str(e)}"}), 500
if __name__ == '__main__':
app.run(host='0.0.0.0', port=5005, debug=True)

236
static/css/style.css Normal file
View File

@ -0,0 +1,236 @@
* {
box-sizing: border-box;
margin: 0;
padding: 0;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Open Sans', 'Helvetica Neue', sans-serif;
background-color: #f5f7fa;
color: #333;
line-height: 1.6;
}
.container {
max-width: 800px;
margin: 0 auto;
padding: 20px;
}
h1 {
text-align: center;
margin-bottom: 5px;
color: #2c3e50;
}
.subtitle {
text-align: center;
color: #7f8c8d;
margin-bottom: 30px;
font-size: 0.9rem;
}
.translation-panel {
background-color: white;
border-radius: 12px;
box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1);
padding: 20px;
margin-bottom: 20px;
}
.language-selector {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 20px;
flex-wrap: wrap;
}
.select-container {
flex: 1;
min-width: 120px;
margin: 5px;
}
.select-container label {
display: block;
margin-bottom: 5px;
font-size: 0.9rem;
color: #7f8c8d;
}
select {
width: 100%;
padding: 10px;
border: 1px solid #ddd;
border-radius: 6px;
background-color: #f9f9f9;
font-size: 1rem;
}
#swapLanguages {
background-color: #e8f4fc;
border: none;
width: 40px;
height: 40px;
border-radius: 50%;
display: flex;
align-items: center;
justify-content: center;
font-size: 1.2rem;
cursor: pointer;
transition: background-color 0.3s;
margin: 0 10px;
}
#swapLanguages:hover {
background-color: #d1e9f9;
}
.text-panels {
display: flex;
flex-direction: column;
gap: 20px;
margin-bottom: 20px;
}
.text-panel {
flex: 1;
}
textarea {
width: 100%;
height: 120px;
padding: 15px;
border: 1px solid #ddd;
border-radius: 8px;
font-size: 1rem;
resize: none;
margin-bottom: 10px;
}
.controls {
display: flex;
gap: 10px;
}
button {
padding: 10px 15px;
border-radius: 6px;
border: none;
cursor: pointer;
font-size: 0.9rem;
transition: background-color 0.3s, transform 0.1s;
}
button:active {
transform: translateY(1px);
}
.record-button, .speak-button {
background-color: #e8f4fc;
color: #3498db;
display: flex;
align-items: center;
gap: 5px;
}
.record-button:hover, .speak-button:hover {
background-color: #d1e9f9;
}
.record-button.recording {
background-color: #ffe9e9;
color: #e74c3c;
animation: pulse 1.5s infinite;
}
.clear-button, .copy-button {
background-color: #f5f5f5;
color: #7f8c8d;
}
.clear-button:hover, .copy-button:hover {
background-color: #e9e9e9;
}
.primary-button {
width: 100%;
padding: 15px;
background-color: #3498db;
color: white;
font-size: 1rem;
font-weight: 500;
border-radius: 8px;
}
.primary-button:hover {
background-color: #2980b9;
}
.status-message {
text-align: center;
min-height: 24px;
color: #7f8c8d;
font-size: 0.9rem;
}
.status-message.error {
color: #e74c3c;
}
.status-message.success {
color: #2ecc71;
}
/* Animation for recording */
@keyframes pulse {
0% {
opacity: 1;
}
50% {
opacity: 0.7;
}
100% {
opacity: 1;
}
}
/* Media queries for responsiveness */
@media (min-width: 768px) {
.text-panels {
flex-direction: row;
}
textarea {
height: 150px;
}
}
@media (max-width: 480px) {
.container {
padding: 10px;
}
h1 {
font-size: 1.5rem;
}
.translation-panel {
padding: 15px;
}
textarea {
height: 100px;
}
.button-text {
display: none;
}
.record-button, .speak-button, .clear-button, .copy-button {
padding: 10px;
flex: 1;
justify-content: center;
}
}

255
static/js/main.js Normal file
View File

@ -0,0 +1,255 @@
document.addEventListener('DOMContentLoaded', function() {
// DOM elements
const sourceLanguage = document.getElementById('sourceLanguage');
const targetLanguage = document.getElementById('targetLanguage');
const swapButton = document.getElementById('swapLanguages');
const sourceText = document.getElementById('sourceText');
const translatedText = document.getElementById('translatedText');
const recordSourceButton = document.getElementById('recordSource');
const speakButton = document.getElementById('speak');
const clearSourceButton = document.getElementById('clearSource');
const copyTranslationButton = document.getElementById('copyTranslation');
const translateButton = document.getElementById('translateButton');
const statusMessage = document.getElementById('status');
// Audio recording variables
let mediaRecorder;
let audioChunks = [];
let isRecording = false;
// Speech recognition setup
const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
const recognition = SpeechRecognition ? new SpeechRecognition() : null;
if (recognition) {
recognition.continuous = false;
recognition.interimResults = false;
}
// Event listeners
swapButton.addEventListener('click', swapLanguages);
translateButton.addEventListener('click', translateText);
clearSourceButton.addEventListener('click', clearSource);
copyTranslationButton.addEventListener('click', copyTranslation);
if (recognition) {
recordSourceButton.addEventListener('click', toggleRecording);
} else {
recordSourceButton.textContent = "Speech API not supported";
recordSourceButton.disabled = true;
}
speakButton.addEventListener('click', speakTranslation);
// Functions (continued)
function swapLanguages() {
const tempLang = sourceLanguage.value;
sourceLanguage.value = targetLanguage.value;
targetLanguage.value = tempLang;
// Also swap the text if both fields have content
if (sourceText.value && translatedText.value) {
const tempText = sourceText.value;
sourceText.value = translatedText.value;
translatedText.value = tempText;
}
}
function clearSource() {
sourceText.value = '';
updateStatus('');
}
function copyTranslation() {
if (!translatedText.value) {
updateStatus('Nothing to copy', 'error');
return;
}
navigator.clipboard.writeText(translatedText.value)
.then(() => {
updateStatus('Copied to clipboard!', 'success');
setTimeout(() => updateStatus(''), 2000);
})
.catch(err => {
updateStatus('Failed to copy: ' + err, 'error');
});
}
async function translateText() {
const source = sourceText.value.trim();
if (!source) {
updateStatus('Please enter or speak some text to translate', 'error');
return;
}
updateStatus('Translating...');
translatedText.value = '';
try {
const response = await fetch('/translate', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
sourceLanguage: sourceLanguage.value,
targetLanguage: targetLanguage.value,
text: source
})
});
const data = await response.json();
if (response.ok) {
translatedText.value = data.translation;
updateStatus('Translation complete', 'success');
setTimeout(() => updateStatus(''), 2000);
} else {
updateStatus(data.error || 'Translation failed', 'error');
}
} catch (error) {
updateStatus('Network error: ' + error.message, 'error');
}
}
function toggleRecording() {
if (!recognition) {
updateStatus('Speech recognition not supported in this browser', 'error');
return;
}
if (isRecording) {
stopRecording();
} else {
startRecording();
}
}
function startRecording() {
sourceText.value = '';
updateStatus('Listening...');
recognition.lang = getLanguageCode(sourceLanguage.value);
recognition.onresult = function(event) {
const transcript = event.results[0][0].transcript;
sourceText.value = transcript;
updateStatus('Recording completed', 'success');
setTimeout(() => updateStatus(''), 2000);
};
recognition.onerror = function(event) {
updateStatus('Error in speech recognition: ' + event.error, 'error');
stopRecording();
};
recognition.onend = function() {
stopRecording();
};
try {
recognition.start();
isRecording = true;
recordSourceButton.classList.add('recording');
recordSourceButton.querySelector('.button-text').textContent = 'Stop';
} catch (error) {
updateStatus('Failed to start recording: ' + error.message, 'error');
}
}
function stopRecording() {
if (isRecording) {
try {
recognition.stop();
} catch (error) {
console.error('Error stopping recognition:', error);
}
isRecording = false;
recordSourceButton.classList.remove('recording');
recordSourceButton.querySelector('.button-text').textContent = 'Record';
}
}
function speakTranslation() {
const text = translatedText.value.trim();
if (!text) {
updateStatus('No translation to speak', 'error');
return;
}
// Use the browser's speech synthesis API
const speech = new SpeechSynthesisUtterance(text);
speech.lang = getLanguageCode(targetLanguage.value);
speech.volume = 1;
speech.rate = 1;
speech.pitch = 1;
speech.onstart = function() {
updateStatus('Speaking...');
speakButton.disabled = true;
};
speech.onend = function() {
updateStatus('');
speakButton.disabled = false;
};
speech.onerror = function(event) {
updateStatus('Speech synthesis error: ' + event.error, 'error');
speakButton.disabled = false;
};
window.speechSynthesis.speak(speech);
}
function getLanguageCode(language) {
// Map language names to BCP 47 language tags for speech recognition/synthesis
const languageMap = {
"arabic": "ar-SA",
"armenian": "hy-AM",
"azerbaijani": "az-AZ",
"english": "en-US",
"french": "fr-FR",
"georgian": "ka-GE",
"kazakh": "kk-KZ",
"mandarin": "zh-CN",
"persian": "fa-IR",
"portuguese": "pt-PT",
"russian": "ru-RU",
"turkish": "tr-TR",
"uzbek": "uz-UZ"
};
return languageMap[language] || 'en-US';
}
function updateStatus(message, type = '') {
statusMessage.textContent = message;
statusMessage.className = 'status-message';
if (type) {
statusMessage.classList.add(type);
}
}
// Check for microphone and speech support when page loads
function checkSupportedFeatures() {
if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
updateStatus('Microphone access is not supported in this browser', 'error');
recordSourceButton.disabled = true;
}
if (!window.SpeechRecognition && !window.webkitSpeechRecognition) {
updateStatus('Speech recognition is not supported in this browser', 'error');
recordSourceButton.disabled = true;
}
if (!window.speechSynthesis) {
updateStatus('Speech synthesis is not supported in this browser', 'error');
speakButton.disabled = true;
}
}
checkSupportedFeatures();
});

71
templates/index.html Normal file
View File

@ -0,0 +1,71 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Multilingual Voice Translator</title>
<link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
</head>
<body>
<div class="container">
<h1>Voice Translator</h1>
<p class="subtitle">Powered by Gemma 3</p>
<div class="translation-panel">
<div class="language-selector">
<div class="select-container">
<label for="sourceLanguage">From:</label>
<select id="sourceLanguage">
{% for code, name in languages.items() %}
<option value="{{ code }}" {% if code == "english" %}selected{% endif %}>{{ name }}</option>
{% endfor %}
</select>
</div>
<button id="swapLanguages" aria-label="Swap languages">
<span></span>
</button>
<div class="select-container">
<label for="targetLanguage">To:</label>
<select id="targetLanguage">
{% for code, name in languages.items() %}
<option value="{{ code }}" {% if code == "french" %}selected{% endif %}>{{ name }}</option>
{% endfor %}
</select>
</div>
</div>
<div class="text-panels">
<div class="text-panel">
<textarea id="sourceText" placeholder="Speak or type text to translate"></textarea>
<div class="controls">
<button id="recordSource" class="record-button">
<span class="mic-icon">🎤</span>
<span class="button-text">Record</span>
</button>
<button id="clearSource" class="clear-button">Clear</button>
</div>
</div>
<div class="text-panel">
<textarea id="translatedText" placeholder="Translation will appear here" readonly></textarea>
<div class="controls">
<button id="speak" class="speak-button">
<span class="speaker-icon">🔊</span>
<span class="button-text">Speak</span>
</button>
<button id="copyTranslation" class="copy-button">Copy</button>
</div>
</div>
</div>
<button id="translateButton" class="primary-button">Translate</button>
</div>
<div id="status" class="status-message"></div>
</div>
<script src="{{ url_for('static', filename='js/main.js') }}"></script>
</body>
</html>

247
venv/bin/Activate.ps1 Normal file
View File

@ -0,0 +1,247 @@
<#
.Synopsis
Activate a Python virtual environment for the current PowerShell session.
.Description
Pushes the python executable for a virtual environment to the front of the
$Env:PATH environment variable and sets the prompt to signify that you are
in a Python virtual environment. Makes use of the command line switches as
well as the `pyvenv.cfg` file values present in the virtual environment.
.Parameter VenvDir
Path to the directory that contains the virtual environment to activate. The
default value for this is the parent of the directory that the Activate.ps1
script is located within.
.Parameter Prompt
The prompt prefix to display when this virtual environment is activated. By
default, this prompt is the name of the virtual environment folder (VenvDir)
surrounded by parentheses and followed by a single space (ie. '(.venv) ').
.Example
Activate.ps1
Activates the Python virtual environment that contains the Activate.ps1 script.
.Example
Activate.ps1 -Verbose
Activates the Python virtual environment that contains the Activate.ps1 script,
and shows extra information about the activation as it executes.
.Example
Activate.ps1 -VenvDir C:\Users\MyUser\Common\.venv
Activates the Python virtual environment located in the specified location.
.Example
Activate.ps1 -Prompt "MyPython"
Activates the Python virtual environment that contains the Activate.ps1 script,
and prefixes the current prompt with the specified string (surrounded in
parentheses) while the virtual environment is active.
.Notes
On Windows, it may be required to enable this Activate.ps1 script by setting the
execution policy for the user. You can do this by issuing the following PowerShell
command:
PS C:\> Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
For more information on Execution Policies:
https://go.microsoft.com/fwlink/?LinkID=135170
#>
Param(
[Parameter(Mandatory = $false)]
[String]
$VenvDir,
[Parameter(Mandatory = $false)]
[String]
$Prompt
)
<# Function declarations --------------------------------------------------- #>
<#
.Synopsis
Remove all shell session elements added by the Activate script, including the
addition of the virtual environment's Python executable from the beginning of
the PATH variable.
.Parameter NonDestructive
If present, do not remove this function from the global namespace for the
session.
#>
function global:deactivate ([switch]$NonDestructive) {
# Revert to original values
# The prior prompt:
if (Test-Path -Path Function:_OLD_VIRTUAL_PROMPT) {
Copy-Item -Path Function:_OLD_VIRTUAL_PROMPT -Destination Function:prompt
Remove-Item -Path Function:_OLD_VIRTUAL_PROMPT
}
# The prior PYTHONHOME:
if (Test-Path -Path Env:_OLD_VIRTUAL_PYTHONHOME) {
Copy-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME -Destination Env:PYTHONHOME
Remove-Item -Path Env:_OLD_VIRTUAL_PYTHONHOME
}
# The prior PATH:
if (Test-Path -Path Env:_OLD_VIRTUAL_PATH) {
Copy-Item -Path Env:_OLD_VIRTUAL_PATH -Destination Env:PATH
Remove-Item -Path Env:_OLD_VIRTUAL_PATH
}
# Just remove the VIRTUAL_ENV altogether:
if (Test-Path -Path Env:VIRTUAL_ENV) {
Remove-Item -Path env:VIRTUAL_ENV
}
# Just remove VIRTUAL_ENV_PROMPT altogether.
if (Test-Path -Path Env:VIRTUAL_ENV_PROMPT) {
Remove-Item -Path env:VIRTUAL_ENV_PROMPT
}
# Just remove the _PYTHON_VENV_PROMPT_PREFIX altogether:
if (Get-Variable -Name "_PYTHON_VENV_PROMPT_PREFIX" -ErrorAction SilentlyContinue) {
Remove-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Scope Global -Force
}
# Leave deactivate function in the global namespace if requested:
if (-not $NonDestructive) {
Remove-Item -Path function:deactivate
}
}
<#
.Description
Get-PyVenvConfig parses the values from the pyvenv.cfg file located in the
given folder, and returns them in a map.
For each line in the pyvenv.cfg file, if that line can be parsed into exactly
two strings separated by `=` (with any amount of whitespace surrounding the =)
then it is considered a `key = value` line. The left hand string is the key,
the right hand is the value.
If the value starts with a `'` or a `"` then the first and last character is
stripped from the value before being captured.
.Parameter ConfigDir
Path to the directory that contains the `pyvenv.cfg` file.
#>
function Get-PyVenvConfig(
[String]
$ConfigDir
) {
Write-Verbose "Given ConfigDir=$ConfigDir, obtain values in pyvenv.cfg"
# Ensure the file exists, and issue a warning if it doesn't (but still allow the function to continue).
$pyvenvConfigPath = Join-Path -Resolve -Path $ConfigDir -ChildPath 'pyvenv.cfg' -ErrorAction Continue
# An empty map will be returned if no config file is found.
$pyvenvConfig = @{ }
if ($pyvenvConfigPath) {
Write-Verbose "File exists, parse `key = value` lines"
$pyvenvConfigContent = Get-Content -Path $pyvenvConfigPath
$pyvenvConfigContent | ForEach-Object {
$keyval = $PSItem -split "\s*=\s*", 2
if ($keyval[0] -and $keyval[1]) {
$val = $keyval[1]
# Remove extraneous quotations around a string value.
if ("'""".Contains($val.Substring(0, 1))) {
$val = $val.Substring(1, $val.Length - 2)
}
$pyvenvConfig[$keyval[0]] = $val
Write-Verbose "Adding Key: '$($keyval[0])'='$val'"
}
}
}
return $pyvenvConfig
}
<# Begin Activate script --------------------------------------------------- #>
# Determine the containing directory of this script
$VenvExecPath = Split-Path -Parent $MyInvocation.MyCommand.Definition
$VenvExecDir = Get-Item -Path $VenvExecPath
Write-Verbose "Activation script is located in path: '$VenvExecPath'"
Write-Verbose "VenvExecDir Fullname: '$($VenvExecDir.FullName)"
Write-Verbose "VenvExecDir Name: '$($VenvExecDir.Name)"
# Set values required in priority: CmdLine, ConfigFile, Default
# First, get the location of the virtual environment, it might not be
# VenvExecDir if specified on the command line.
if ($VenvDir) {
Write-Verbose "VenvDir given as parameter, using '$VenvDir' to determine values"
}
else {
Write-Verbose "VenvDir not given as a parameter, using parent directory name as VenvDir."
$VenvDir = $VenvExecDir.Parent.FullName.TrimEnd("\\/")
Write-Verbose "VenvDir=$VenvDir"
}
# Next, read the `pyvenv.cfg` file to determine any required value such
# as `prompt`.
$pyvenvCfg = Get-PyVenvConfig -ConfigDir $VenvDir
# Next, set the prompt from the command line, or the config file, or
# just use the name of the virtual environment folder.
if ($Prompt) {
Write-Verbose "Prompt specified as argument, using '$Prompt'"
}
else {
Write-Verbose "Prompt not specified as argument to script, checking pyvenv.cfg value"
if ($pyvenvCfg -and $pyvenvCfg['prompt']) {
Write-Verbose " Setting based on value in pyvenv.cfg='$($pyvenvCfg['prompt'])'"
$Prompt = $pyvenvCfg['prompt'];
}
else {
Write-Verbose " Setting prompt based on parent's directory's name. (Is the directory name passed to venv module when creating the virtual environment)"
Write-Verbose " Got leaf-name of $VenvDir='$(Split-Path -Path $venvDir -Leaf)'"
$Prompt = Split-Path -Path $venvDir -Leaf
}
}
Write-Verbose "Prompt = '$Prompt'"
Write-Verbose "VenvDir='$VenvDir'"
# Deactivate any currently active virtual environment, but leave the
# deactivate function in place.
deactivate -nondestructive
# Now set the environment variable VIRTUAL_ENV, used by many tools to determine
# that there is an activated venv.
$env:VIRTUAL_ENV = $VenvDir
if (-not $Env:VIRTUAL_ENV_DISABLE_PROMPT) {
Write-Verbose "Setting prompt to '$Prompt'"
# Set the prompt to include the env name
# Make sure _OLD_VIRTUAL_PROMPT is global
function global:_OLD_VIRTUAL_PROMPT { "" }
Copy-Item -Path function:prompt -Destination function:_OLD_VIRTUAL_PROMPT
New-Variable -Name _PYTHON_VENV_PROMPT_PREFIX -Description "Python virtual environment prompt prefix" -Scope Global -Option ReadOnly -Visibility Public -Value $Prompt
function global:prompt {
Write-Host -NoNewline -ForegroundColor Green "($_PYTHON_VENV_PROMPT_PREFIX) "
_OLD_VIRTUAL_PROMPT
}
$env:VIRTUAL_ENV_PROMPT = $Prompt
}
# Clear PYTHONHOME
if (Test-Path -Path Env:PYTHONHOME) {
Copy-Item -Path Env:PYTHONHOME -Destination Env:_OLD_VIRTUAL_PYTHONHOME
Remove-Item -Path Env:PYTHONHOME
}
# Add the venv to the PATH
Copy-Item -Path Env:PATH -Destination Env:_OLD_VIRTUAL_PATH
$Env:PATH = "$VenvExecDir$([System.IO.Path]::PathSeparator)$Env:PATH"

69
venv/bin/activate Normal file
View File

@ -0,0 +1,69 @@
# This file must be used with "source bin/activate" *from bash*
# you cannot run it directly
deactivate () {
# reset old environment variables
if [ -n "${_OLD_VIRTUAL_PATH:-}" ] ; then
PATH="${_OLD_VIRTUAL_PATH:-}"
export PATH
unset _OLD_VIRTUAL_PATH
fi
if [ -n "${_OLD_VIRTUAL_PYTHONHOME:-}" ] ; then
PYTHONHOME="${_OLD_VIRTUAL_PYTHONHOME:-}"
export PYTHONHOME
unset _OLD_VIRTUAL_PYTHONHOME
fi
# This should detect bash and zsh, which have a hash command that must
# be called to get it to forget past commands. Without forgetting
# past commands the $PATH changes we made may not be respected
if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
hash -r 2> /dev/null
fi
if [ -n "${_OLD_VIRTUAL_PS1:-}" ] ; then
PS1="${_OLD_VIRTUAL_PS1:-}"
export PS1
unset _OLD_VIRTUAL_PS1
fi
unset VIRTUAL_ENV
unset VIRTUAL_ENV_PROMPT
if [ ! "${1:-}" = "nondestructive" ] ; then
# Self destruct!
unset -f deactivate
fi
}
# unset irrelevant variables
deactivate nondestructive
VIRTUAL_ENV=/home/adelorenzo/repos/talk2me/venv
export VIRTUAL_ENV
_OLD_VIRTUAL_PATH="$PATH"
PATH="$VIRTUAL_ENV/"bin":$PATH"
export PATH
# unset PYTHONHOME if set
# this will fail if PYTHONHOME is set to the empty string (which is bad anyway)
# could use `if (set -u; : $PYTHONHOME) ;` in bash
if [ -n "${PYTHONHOME:-}" ] ; then
_OLD_VIRTUAL_PYTHONHOME="${PYTHONHOME:-}"
unset PYTHONHOME
fi
if [ -z "${VIRTUAL_ENV_DISABLE_PROMPT:-}" ] ; then
_OLD_VIRTUAL_PS1="${PS1:-}"
PS1='(venv) '"${PS1:-}"
export PS1
VIRTUAL_ENV_PROMPT='(venv) '
export VIRTUAL_ENV_PROMPT
fi
# This should detect bash and zsh, which have a hash command that must
# be called to get it to forget past commands. Without forgetting
# past commands the $PATH changes we made may not be respected
if [ -n "${BASH:-}" -o -n "${ZSH_VERSION:-}" ] ; then
hash -r 2> /dev/null
fi

26
venv/bin/activate.csh Normal file
View File

@ -0,0 +1,26 @@
# This file must be used with "source bin/activate.csh" *from csh*.
# You cannot run it directly.
# Created by Davide Di Blasi <davidedb@gmail.com>.
# Ported to Python 3.3 venv by Andrew Svetlov <andrew.svetlov@gmail.com>
alias deactivate 'test $?_OLD_VIRTUAL_PATH != 0 && setenv PATH "$_OLD_VIRTUAL_PATH" && unset _OLD_VIRTUAL_PATH; rehash; test $?_OLD_VIRTUAL_PROMPT != 0 && set prompt="$_OLD_VIRTUAL_PROMPT" && unset _OLD_VIRTUAL_PROMPT; unsetenv VIRTUAL_ENV; unsetenv VIRTUAL_ENV_PROMPT; test "\!:*" != "nondestructive" && unalias deactivate'
# Unset irrelevant variables.
deactivate nondestructive
setenv VIRTUAL_ENV /home/adelorenzo/repos/talk2me/venv
set _OLD_VIRTUAL_PATH="$PATH"
setenv PATH "$VIRTUAL_ENV/"bin":$PATH"
set _OLD_VIRTUAL_PROMPT="$prompt"
if (! "$?VIRTUAL_ENV_DISABLE_PROMPT") then
set prompt = '(venv) '"$prompt"
setenv VIRTUAL_ENV_PROMPT '(venv) '
endif
alias pydoc python -m pydoc
rehash

69
venv/bin/activate.fish Normal file
View File

@ -0,0 +1,69 @@
# This file must be used with "source <venv>/bin/activate.fish" *from fish*
# (https://fishshell.com/); you cannot run it directly.
function deactivate -d "Exit virtual environment and return to normal shell environment"
# reset old environment variables
if test -n "$_OLD_VIRTUAL_PATH"
set -gx PATH $_OLD_VIRTUAL_PATH
set -e _OLD_VIRTUAL_PATH
end
if test -n "$_OLD_VIRTUAL_PYTHONHOME"
set -gx PYTHONHOME $_OLD_VIRTUAL_PYTHONHOME
set -e _OLD_VIRTUAL_PYTHONHOME
end
if test -n "$_OLD_FISH_PROMPT_OVERRIDE"
set -e _OLD_FISH_PROMPT_OVERRIDE
# prevents error when using nested fish instances (Issue #93858)
if functions -q _old_fish_prompt
functions -e fish_prompt
functions -c _old_fish_prompt fish_prompt
functions -e _old_fish_prompt
end
end
set -e VIRTUAL_ENV
set -e VIRTUAL_ENV_PROMPT
if test "$argv[1]" != "nondestructive"
# Self-destruct!
functions -e deactivate
end
end
# Unset irrelevant variables.
deactivate nondestructive
set -gx VIRTUAL_ENV /home/adelorenzo/repos/talk2me/venv
set -gx _OLD_VIRTUAL_PATH $PATH
set -gx PATH "$VIRTUAL_ENV/"bin $PATH
# Unset PYTHONHOME if set.
if set -q PYTHONHOME
set -gx _OLD_VIRTUAL_PYTHONHOME $PYTHONHOME
set -e PYTHONHOME
end
if test -z "$VIRTUAL_ENV_DISABLE_PROMPT"
# fish uses a function instead of an env var to generate the prompt.
# Save the current fish_prompt function as the function _old_fish_prompt.
functions -c fish_prompt _old_fish_prompt
# With the original prompt function renamed, we can override with our own.
function fish_prompt
# Save the return status of the last command.
set -l old_status $status
# Output the venv prompt; color taken from the blue of the Python logo.
printf "%s%s%s" (set_color 4B8BBE) '(venv) ' (set_color normal)
# Restore the return status of the previous command.
echo "exit $old_status" | .
# Output the original/"old" prompt.
_old_fish_prompt
end
set -gx _OLD_FISH_PROMPT_OVERRIDE "$VIRTUAL_ENV"
set -gx VIRTUAL_ENV_PROMPT '(venv) '
end

8
venv/bin/flask Executable file
View File

@ -0,0 +1,8 @@
#!/home/adelorenzo/repos/talk2me/venv/bin/python3
# -*- coding: utf-8 -*-
import re
import sys
from flask.cli import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/normalizer Executable file
View File

@ -0,0 +1,8 @@
#!/home/adelorenzo/repos/talk2me/venv/bin/python3
# -*- coding: utf-8 -*-
import re
import sys
from charset_normalizer import cli
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(cli.cli_detect())

8
venv/bin/pip Executable file
View File

@ -0,0 +1,8 @@
#!/home/adelorenzo/repos/talk2me/venv/bin/python3
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/pip3 Executable file
View File

@ -0,0 +1,8 @@
#!/home/adelorenzo/repos/talk2me/venv/bin/python3
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

8
venv/bin/pip3.11 Executable file
View File

@ -0,0 +1,8 @@
#!/home/adelorenzo/repos/talk2me/venv/bin/python3
# -*- coding: utf-8 -*-
import re
import sys
from pip._internal.cli.main import main
if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
sys.exit(main())

1
venv/bin/python Symbolic link
View File

@ -0,0 +1 @@
python3

1
venv/bin/python3 Symbolic link
View File

@ -0,0 +1 @@
/usr/bin/python3

1
venv/bin/python3.11 Symbolic link
View File

@ -0,0 +1 @@
python3

View File

@ -0,0 +1,28 @@
Copyright 2010 Pallets
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,92 @@
Metadata-Version: 2.1
Name: MarkupSafe
Version: 3.0.2
Summary: Safely add untrusted strings to HTML/XML markup.
Maintainer-email: Pallets <contact@palletsprojects.com>
License: Copyright 2010 Pallets
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Project-URL: Donate, https://palletsprojects.com/donate
Project-URL: Documentation, https://markupsafe.palletsprojects.com/
Project-URL: Changes, https://markupsafe.palletsprojects.com/changes/
Project-URL: Source, https://github.com/pallets/markupsafe/
Project-URL: Chat, https://discord.gg/pallets
Classifier: Development Status :: 5 - Production/Stable
Classifier: Environment :: Web Environment
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: BSD License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
Classifier: Topic :: Text Processing :: Markup :: HTML
Classifier: Typing :: Typed
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE.txt
# MarkupSafe
MarkupSafe implements a text object that escapes characters so it is
safe to use in HTML and XML. Characters that have special meanings are
replaced so that they display as the actual characters. This mitigates
injection attacks, meaning untrusted user input can safely be displayed
on a page.
## Examples
```pycon
>>> from markupsafe import Markup, escape
>>> # escape replaces special characters and wraps in Markup
>>> escape("<script>alert(document.cookie);</script>")
Markup('&lt;script&gt;alert(document.cookie);&lt;/script&gt;')
>>> # wrap in Markup to mark text "safe" and prevent escaping
>>> Markup("<strong>Hello</strong>")
Markup('<strong>hello</strong>')
>>> escape(Markup("<strong>Hello</strong>"))
Markup('<strong>hello</strong>')
>>> # Markup is a str subclass
>>> # methods and operators escape their arguments
>>> template = Markup("Hello <em>{name}</em>")
>>> template.format(name='"World"')
Markup('Hello <em>&#34;World&#34;</em>')
```
## Donate
The Pallets organization develops and supports MarkupSafe and other
popular packages. In order to grow the community of contributors and
users, and allow the maintainers to devote more time to the projects,
[please donate today][].
[please donate today]: https://palletsprojects.com/donate

View File

@ -0,0 +1,14 @@
MarkupSafe-3.0.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
MarkupSafe-3.0.2.dist-info/LICENSE.txt,sha256=SJqOEQhQntmKN7uYPhHg9-HTHwvY-Zp5yESOf_N9B-o,1475
MarkupSafe-3.0.2.dist-info/METADATA,sha256=aAwbZhSmXdfFuMM-rEHpeiHRkBOGESyVLJIuwzHP-nw,3975
MarkupSafe-3.0.2.dist-info/RECORD,,
MarkupSafe-3.0.2.dist-info/WHEEL,sha256=Op2RVjKCU4Yd3uty1Wlljkjcwas4cTvIrdqkKFZWK28,153
MarkupSafe-3.0.2.dist-info/top_level.txt,sha256=qy0Plje5IJuvsCBjejJyhDCjEAdcDLK_2agVcex8Z6U,11
markupsafe/__init__.py,sha256=sr-U6_27DfaSrj5jnHYxWN-pvhM27sjlDplMDPZKm7k,13214
markupsafe/__pycache__/__init__.cpython-311.pyc,,
markupsafe/__pycache__/_native.cpython-311.pyc,,
markupsafe/_native.py,sha256=hSLs8Jmz5aqayuengJJ3kdT5PwNpBWpKrmQSdipndC8,210
markupsafe/_speedups.c,sha256=O7XulmTo-epI6n2FtMVOrJXl8EAaIwD2iNYmBI5SEoQ,4149
markupsafe/_speedups.cpython-311-aarch64-linux-gnu.so,sha256=ERBcuz-gl_TnODv5KWmFWXAr45_JjnsouJnevCcUXlc,98536
markupsafe/_speedups.pyi,sha256=ENd1bYe7gbBUf2ywyYWOGUpnXOHNJ-cgTNqetlW8h5k,41
markupsafe/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

View File

@ -0,0 +1,6 @@
Wheel-Version: 1.0
Generator: setuptools (75.2.0)
Root-Is-Purelib: false
Tag: cp311-cp311-manylinux_2_17_aarch64
Tag: cp311-cp311-manylinux2014_aarch64

View File

@ -0,0 +1 @@
markupsafe

View File

@ -0,0 +1,339 @@
GNU GENERAL PUBLIC LICENSE
Version 2, June 1991
Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
License is intended to guarantee your freedom to share and change free
software--to make sure the software is free for all its users. This
General Public License applies to most of the Free Software
Foundation's software and to any other program whose authors commit to
using it. (Some other Free Software Foundation software is covered by
the GNU Lesser General Public License instead.) You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
this service if you wish), that you receive source code or can get it
if you want it, that you can change the software or use pieces of it
in new free programs; and that you know you can do these things.
To protect your rights, we need to make restrictions that forbid
anyone to deny you these rights or to ask you to surrender the rights.
These restrictions translate to certain responsibilities for you if you
distribute copies of the software, or if you modify it.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must give the recipients all the rights that
you have. You must make sure that they, too, receive or can get the
source code. And you must show them these terms so they know their
rights.
We protect your rights with two steps: (1) copyright the software, and
(2) offer you this license which gives you legal permission to copy,
distribute and/or modify the software.
Also, for each author's protection and ours, we want to make certain
that everyone understands that there is no warranty for this free
software. If the software is modified by someone else and passed on, we
want its recipients to know that what they have is not the original, so
that any problems introduced by others will not reflect on the original
authors' reputations.
Finally, any free program is threatened constantly by software
patents. We wish to avoid the danger that redistributors of a free
program will individually obtain patent licenses, in effect making the
program proprietary. To prevent this, we have made it clear that any
patent must be licensed for everyone's free use or not licensed at all.
The precise terms and conditions for copying, distribution and
modification follow.
GNU GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License applies to any program or other work which contains
a notice placed by the copyright holder saying it may be distributed
under the terms of this General Public License. The "Program", below,
refers to any such program or work, and a "work based on the Program"
means either the Program or any derivative work under copyright law:
that is to say, a work containing the Program or a portion of it,
either verbatim or with modifications and/or translated into another
language. (Hereinafter, translation is included without limitation in
the term "modification".) Each licensee is addressed as "you".
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running the Program is not restricted, and the output from the Program
is covered only if its contents constitute a work based on the
Program (independent of having been made by running the Program).
Whether that is true depends on what the Program does.
1. You may copy and distribute verbatim copies of the Program's
source code as you receive it, in any medium, provided that you
conspicuously and appropriately publish on each copy an appropriate
copyright notice and disclaimer of warranty; keep intact all the
notices that refer to this License and to the absence of any warranty;
and give any other recipients of the Program a copy of this License
along with the Program.
You may charge a fee for the physical act of transferring a copy, and
you may at your option offer warranty protection in exchange for a fee.
2. You may modify your copy or copies of the Program or any portion
of it, thus forming a work based on the Program, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) You must cause the modified files to carry prominent notices
stating that you changed the files and the date of any change.
b) You must cause any work that you distribute or publish, that in
whole or in part contains or is derived from the Program or any
part thereof, to be licensed as a whole at no charge to all third
parties under the terms of this License.
c) If the modified program normally reads commands interactively
when run, you must cause it, when started running for such
interactive use in the most ordinary way, to print or display an
announcement including an appropriate copyright notice and a
notice that there is no warranty (or else, saying that you provide
a warranty) and that users may redistribute the program under
these conditions, and telling the user how to view a copy of this
License. (Exception: if the Program itself is interactive but
does not normally print such an announcement, your work based on
the Program is not required to print an announcement.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Program,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Program, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Program.
In addition, mere aggregation of another work not based on the Program
with the Program (or with a work based on the Program) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may copy and distribute the Program (or a work based on it,
under Section 2) in object code or executable form under the terms of
Sections 1 and 2 above provided that you also do one of the following:
a) Accompany it with the complete corresponding machine-readable
source code, which must be distributed under the terms of Sections
1 and 2 above on a medium customarily used for software interchange; or,
b) Accompany it with a written offer, valid for at least three
years, to give any third party, for a charge no more than your
cost of physically performing source distribution, a complete
machine-readable copy of the corresponding source code, to be
distributed under the terms of Sections 1 and 2 above on a medium
customarily used for software interchange; or,
c) Accompany it with the information you received as to the offer
to distribute corresponding source code. (This alternative is
allowed only for noncommercial distribution and only if you
received the program in object code or executable form with such
an offer, in accord with Subsection b above.)
The source code for a work means the preferred form of the work for
making modifications to it. For an executable work, complete source
code means all the source code for all modules it contains, plus any
associated interface definition files, plus the scripts used to
control compilation and installation of the executable. However, as a
special exception, the source code distributed need not include
anything that is normally distributed (in either source or binary
form) with the major components (compiler, kernel, and so on) of the
operating system on which the executable runs, unless that component
itself accompanies the executable.
If distribution of executable or object code is made by offering
access to copy from a designated place, then offering equivalent
access to copy the source code from the same place counts as
distribution of the source code, even though third parties are not
compelled to copy the source along with the object code.
4. You may not copy, modify, sublicense, or distribute the Program
except as expressly provided under this License. Any attempt
otherwise to copy, modify, sublicense or distribute the Program is
void, and will automatically terminate your rights under this License.
However, parties who have received copies, or rights, from you under
this License will not have their licenses terminated so long as such
parties remain in full compliance.
5. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Program or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Program (or any work based on the
Program), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Program or works based on it.
6. Each time you redistribute the Program (or any work based on the
Program), the recipient automatically receives a license from the
original licensor to copy, distribute or modify the Program subject to
these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties to
this License.
7. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Program at all. For example, if a patent
license would not permit royalty-free redistribution of the Program by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Program.
If any portion of this section is held invalid or unenforceable under
any particular circumstance, the balance of the section is intended to
apply and the section as a whole is intended to apply in other
circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system, which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
8. If the distribution and/or use of the Program is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Program under this License
may add an explicit geographical distribution limitation excluding
those countries, so that distribution is permitted only in or among
countries not thus excluded. In such case, this License incorporates
the limitation as if written in the body of this License.
9. The Free Software Foundation may publish revised and/or new versions
of the General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the Program
specifies a version number of this License which applies to it and "any
later version", you have the option of following the terms and conditions
either of that version or of any later version published by the Free
Software Foundation. If the Program does not specify a version number of
this License, you may choose any version ever published by the Free Software
Foundation.
10. If you wish to incorporate parts of the Program into other free
programs whose distribution conditions are different, write to the author
to ask for permission. For software which is copyrighted by the Free
Software Foundation, write to the Free Software Foundation; we sometimes
make exceptions for this. Our decision will be guided by the two goals
of preserving the free status of all derivatives of our free software and
of promoting the sharing and reuse of software generally.
NO WARRANTY
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
REPAIR OR CORRECTION.
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGES.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
Also add information on how to contact you by electronic and paper mail.
If the program is interactive, make it output a short notice like this
when it starts in an interactive mode:
Gnomovision version 69, Copyright (C) year name of author
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, the commands you use may
be called something other than `show w' and `show c'; they could even be
mouse-clicks or menu items--whatever suits your program.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the program, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
`Gnomovision' (which makes passes at compilers) written by James Hacker.
<signature of Ty Coon>, 1 April 1989
Ty Coon, President of Vice
This General Public License does not permit incorporating your program into
proprietary programs. If your program is a subroutine library, you may
consider it more useful to permit linking proprietary applications with the
library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License.

View File

@ -0,0 +1,12 @@
Copyright (c) 2014-, Anthony Zhang <azhang9@gmail.com>
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,464 @@
Metadata-Version: 2.1
Name: SpeechRecognition
Version: 3.14.2
Summary: Library for performing speech recognition, with support for several engines and APIs, online and offline.
Home-page: https://github.com/Uberi/speech_recognition#readme
Author: Anthony Zhang (Uberi)
Author-email: azhang9@gmail.com
License: BSD
Keywords: speech recognition voice sphinx google wit bing api houndify ibm snowboy
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: Natural Language :: English
Classifier: License :: OSI Approved :: BSD License
Classifier: Operating System :: Microsoft :: Windows
Classifier: Operating System :: POSIX :: Linux
Classifier: Operating System :: MacOS :: MacOS X
Classifier: Operating System :: Other OS
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
Requires-Python: >=3.9
Description-Content-Type: text/x-rst
License-File: LICENSE-FLAC.txt
License-File: LICENSE.txt
Requires-Dist: typing-extensions
Requires-Dist: standard-aifc ; python_version >= "3.13"
Requires-Dist: audioop-lts ; python_version >= "3.13"
Provides-Extra: assemblyai
Requires-Dist: requests ; extra == 'assemblyai'
Provides-Extra: audio
Requires-Dist: PyAudio (>=0.2.11) ; extra == 'audio'
Provides-Extra: dev
Requires-Dist: pytest ; extra == 'dev'
Requires-Dist: pytest-randomly ; extra == 'dev'
Requires-Dist: respx ; extra == 'dev'
Requires-Dist: numpy ; extra == 'dev'
Provides-Extra: faster-whisper
Requires-Dist: faster-whisper ; extra == 'faster-whisper'
Provides-Extra: google-cloud
Requires-Dist: google-cloud-speech ; extra == 'google-cloud'
Provides-Extra: groq
Requires-Dist: groq ; extra == 'groq'
Requires-Dist: httpx (<0.28) ; extra == 'groq'
Provides-Extra: openai
Requires-Dist: openai ; extra == 'openai'
Requires-Dist: httpx (<0.28) ; extra == 'openai'
Provides-Extra: pocketsphinx
Requires-Dist: pocketsphinx ; extra == 'pocketsphinx'
Provides-Extra: whisper-local
Requires-Dist: openai-whisper ; extra == 'whisper-local'
Requires-Dist: soundfile ; extra == 'whisper-local'
SpeechRecognition
=================
.. image:: https://img.shields.io/pypi/v/SpeechRecognition.svg
:target: https://pypi.python.org/pypi/SpeechRecognition/
:alt: Latest Version
.. image:: https://img.shields.io/pypi/status/SpeechRecognition.svg
:target: https://pypi.python.org/pypi/SpeechRecognition/
:alt: Development Status
.. image:: https://img.shields.io/pypi/pyversions/SpeechRecognition.svg
:target: https://pypi.python.org/pypi/SpeechRecognition/
:alt: Supported Python Versions
.. image:: https://img.shields.io/pypi/l/SpeechRecognition.svg
:target: https://pypi.python.org/pypi/SpeechRecognition/
:alt: License
.. image:: https://api.travis-ci.org/Uberi/speech_recognition.svg?branch=master
:target: https://travis-ci.org/Uberi/speech_recognition
:alt: Continuous Integration Test Results
Library for performing speech recognition, with support for several engines and APIs, online and offline.
**UPDATE 2022-02-09**: Hey everyone! This project started as a tech demo, but these days it needs more time than I have to keep up with all the PRs and issues. Therefore, I'd like to put out an **open invite for collaborators** - just reach out at me@anthonyz.ca if you're interested!
Speech recognition engine/API support:
* `CMU Sphinx <http://cmusphinx.sourceforge.net/wiki/>`__ (works offline)
* Google Speech Recognition
* `Google Cloud Speech API <https://cloud.google.com/speech/>`__
* `Wit.ai <https://wit.ai/>`__
* `Microsoft Azure Speech <https://azure.microsoft.com/en-us/services/cognitive-services/speech/>`__
* `Microsoft Bing Voice Recognition (Deprecated) <https://www.microsoft.com/cognitive-services/en-us/speech-api>`__
* `Houndify API <https://houndify.com/>`__
* `IBM Speech to Text <http://www.ibm.com/smarterplanet/us/en/ibmwatson/developercloud/speech-to-text.html>`__
* `Snowboy Hotword Detection <https://snowboy.kitt.ai/>`__ (works offline)
* `Tensorflow <https://www.tensorflow.org/>`__
* `Vosk API <https://github.com/alphacep/vosk-api/>`__ (works offline)
* `OpenAI whisper <https://github.com/openai/whisper>`__ (works offline)
* `OpenAI Whisper API <https://platform.openai.com/docs/guides/speech-to-text>`__
* `Groq Whisper API <https://console.groq.com/docs/speech-text>`__
**Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details.
To quickly try it out, run ``python -m speech_recognition`` after installing.
Project links:
- `PyPI <https://pypi.python.org/pypi/SpeechRecognition/>`__
- `Source code <https://github.com/Uberi/speech_recognition>`__
- `Issue tracker <https://github.com/Uberi/speech_recognition/issues>`__
Library Reference
-----------------
The `library reference <https://github.com/Uberi/speech_recognition/blob/master/reference/library-reference.rst>`__ documents every publicly accessible object in the library. This document is also included under ``reference/library-reference.rst``.
See `Notes on using PocketSphinx <https://github.com/Uberi/speech_recognition/blob/master/reference/pocketsphinx.rst>`__ for information about installing languages, compiling PocketSphinx, and building language packs from online resources. This document is also included under ``reference/pocketsphinx.rst``.
You have to install Vosk models for using Vosk. `Here <https://alphacephei.com/vosk/models>`__ are models avaiable. You have to place them in models folder of your project, like "your-project-folder/models/your-vosk-model"
Examples
--------
See the ``examples/`` `directory <https://github.com/Uberi/speech_recognition/tree/master/examples>`__ in the repository root for usage examples:
- `Recognize speech input from the microphone <https://github.com/Uberi/speech_recognition/blob/master/examples/microphone_recognition.py>`__
- `Transcribe an audio file <https://github.com/Uberi/speech_recognition/blob/master/examples/audio_transcribe.py>`__
- `Save audio data to an audio file <https://github.com/Uberi/speech_recognition/blob/master/examples/write_audio.py>`__
- `Show extended recognition results <https://github.com/Uberi/speech_recognition/blob/master/examples/extended_results.py>`__
- `Calibrate the recognizer energy threshold for ambient noise levels <https://github.com/Uberi/speech_recognition/blob/master/examples/calibrate_energy_threshold.py>`__ (see ``recognizer_instance.energy_threshold`` for details)
- `Listening to a microphone in the background <https://github.com/Uberi/speech_recognition/blob/master/examples/background_listening.py>`__
- `Various other useful recognizer features <https://github.com/Uberi/speech_recognition/blob/master/examples/special_recognizer_features.py>`__
Installing
----------
First, make sure you have all the requirements listed in the "Requirements" section.
The easiest way to install this is using ``pip install SpeechRecognition``.
Otherwise, download the source distribution from `PyPI <https://pypi.python.org/pypi/SpeechRecognition/>`__, and extract the archive.
In the folder, run ``python setup.py install``.
Requirements
------------
To use all of the functionality of the library, you should have:
* **Python** 3.9+ (required)
* **PyAudio** 0.2.11+ (required only if you need to use microphone input, ``Microphone``)
* **PocketSphinx** (required only if you need to use the Sphinx recognizer, ``recognizer_instance.recognize_sphinx``)
* **Google API Client Library for Python** (required only if you need to use the Google Cloud Speech API, ``recognizer_instance.recognize_google_cloud``)
* **FLAC encoder** (required only if the system is not x86-based Windows/Linux/OS X)
* **Vosk** (required only if you need to use Vosk API speech recognition ``recognizer_instance.recognize_vosk``)
* **Whisper** (required only if you need to use Whisper ``recognizer_instance.recognize_whisper``)
* **Faster Whisper** (required only if you need to use Faster Whisper ``recognizer_instance.recognize_faster_whisper``)
* **openai** (required only if you need to use OpenAI Whisper API speech recognition ``recognizer_instance.recognize_openai``)
* **groq** (required only if you need to use Groq Whisper API speech recognition ``recognizer_instance.recognize_groq``)
The following requirements are optional, but can improve or extend functionality in some situations:
* If using CMU Sphinx, you may want to `install additional language packs <https://github.com/Uberi/speech_recognition/blob/master/reference/pocketsphinx.rst#installing-other-languages>`__ to support languages like International French or Mandarin Chinese.
The following sections go over the details of each requirement.
Python
~~~~~~
The first software requirement is `Python 3.9+ <https://www.python.org/downloads/>`__. This is required to use the library.
PyAudio (for microphone users)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
`PyAudio <http://people.csail.mit.edu/hubert/pyaudio/#downloads>`__ is required if and only if you want to use microphone input (``Microphone``). PyAudio version 0.2.11+ is required, as earlier versions have known memory management bugs when recording from microphones in certain situations.
If not installed, everything in the library will still work, except attempting to instantiate a ``Microphone`` object will raise an ``AttributeError``.
The installation instructions on the PyAudio website are quite good - for convenience, they are summarized below:
* On Windows, install with PyAudio using `Pip <https://pip.readthedocs.org/>`__: execute ``pip install SpeechRecognition[audio]`` in a terminal.
* On Debian-derived Linux distributions (like Ubuntu and Mint), install PyAudio using `APT <https://wiki.debian.org/Apt>`__: execute ``sudo apt-get install python-pyaudio python3-pyaudio`` in a terminal.
* If the version in the repositories is too old, install the latest release using Pip: execute ``sudo apt-get install portaudio19-dev python-all-dev python3-all-dev && sudo pip install SpeechRecognition[audio]`` (replace ``pip`` with ``pip3`` if using Python 3).
* On OS X, install PortAudio using `Homebrew <http://brew.sh/>`__: ``brew install portaudio``. Then, install with PyAudio using `Pip <https://pip.readthedocs.org/>`__: ``pip install SpeechRecognition[audio]``.
* On other POSIX-based systems, install the ``portaudio19-dev`` and ``python-all-dev`` (or ``python3-all-dev`` if using Python 3) packages (or their closest equivalents) using a package manager of your choice, and then install with PyAudio using `Pip <https://pip.readthedocs.org/>`__: ``pip install SpeechRecognition[audio]`` (replace ``pip`` with ``pip3`` if using Python 3).
PocketSphinx (for Sphinx users)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
`PocketSphinx <https://github.com/cmusphinx/pocketsphinx>`__ is **required if and only if you want to use the Sphinx recognizer** (``recognizer_instance.recognize_sphinx``).
On Linux and other POSIX systems (such as OS X), run ``pip install SpeechRecognition[pocketsphinx]``. Follow the instructions under "Building PocketSphinx-Python from source" in `Notes on using PocketSphinx <https://github.com/Uberi/speech_recognition/blob/master/reference/pocketsphinx.rst>`__ for installation instructions.
Note that the versions available in most package repositories are outdated and will not work with the bundled language data. Using the bundled wheel packages or building from source is recommended.
See `Notes on using PocketSphinx <https://github.com/Uberi/speech_recognition/blob/master/reference/pocketsphinx.rst>`__ for information about installing languages, compiling PocketSphinx, and building language packs from online resources. This document is also included under ``reference/pocketsphinx.rst``.
Vosk (for Vosk users)
~~~~~~~~~~~~~~~~~~~~~
Vosk API is **required if and only if you want to use Vosk recognizer** (``recognizer_instance.recognize_vosk``).
You can install it with ``python3 -m pip install vosk``.
You also have to install Vosk Models:
`Here <https://alphacephei.com/vosk/models>`__ are models avaiable for download. You have to place them in models folder of your project, like "your-project-folder/models/your-vosk-model"
Google Cloud Speech Library for Python (for Google Cloud Speech-to-Text API users)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The library `google-cloud-speech <https://pypi.org/project/google-cloud-speech/>`__ is **required if and only if you want to use Google Cloud Speech-to-Text API** (``recognizer_instance.recognize_google_cloud``).
You can install it with ``python3 -m pip install SpeechRecognition[google-cloud]``.
(ref: `official installation instructions <https://cloud.google.com/speech-to-text/docs/transcribe-client-libraries#client-libraries-install-python>`__)
**Prerequisite**: Create local authentication credentials for your Google account
* Digest: `Before you begin (Transcribe speech to text by using client libraries) <https://cloud.google.com/speech-to-text/docs/transcribe-client-libraries#before-you-begin>`__
* `Set up Speech-to-Text <https://cloud.google.com/speech-to-text/docs/before-you-begin>`__
* `User credentials (Set up ADC for a local development environment) <https://cloud.google.com/docs/authentication/set-up-adc-local-dev-environment#local-user-cred>`__
Currently only `V1 <https://cloud.google.com/speech-to-text/docs/quickstart>`__ is supported. (`V2 <https://cloud.google.com/speech-to-text/v2/docs/quickstart>`__ is not supported)
FLAC (for some systems)
~~~~~~~~~~~~~~~~~~~~~~~
A `FLAC encoder <https://xiph.org/flac/>`__ is required to encode the audio data to send to the API. If using Windows (x86 or x86-64), OS X (Intel Macs only, OS X 10.6 or higher), or Linux (x86 or x86-64), this is **already bundled with this library - you do not need to install anything**.
Otherwise, ensure that you have the ``flac`` command line tool, which is often available through the system package manager. For example, this would usually be ``sudo apt-get install flac`` on Debian-derivatives, or ``brew install flac`` on OS X with Homebrew.
Whisper (for Whisper users)
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Whisper is **required if and only if you want to use whisper** (``recognizer_instance.recognize_whisper``).
You can install it with ``python3 -m pip install SpeechRecognition[whisper-local]``.
Faster Whisper (for Faster Whisper users)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The library `faster-whisper <https://pypi.org/project/faster-whisper/>`__ is **required if and only if you want to use Faster Whisper** (``recognizer_instance.recognize_faster_whisper``).
You can install it with ``python3 -m pip install SpeechRecognition[faster-whisper]``.
OpenAI Whisper API (for OpenAI Whisper API users)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The library `openai <https://pypi.org/project/openai/>`__ is **required if and only if you want to use OpenAI Whisper API** (``recognizer_instance.recognize_openai``).
You can install it with ``python3 -m pip install SpeechRecognition[openai]``.
Please set the environment variable ``OPENAI_API_KEY`` before calling ``recognizer_instance.recognize_openai``.
Groq Whisper API (for Groq Whisper API users)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The library `groq <https://pypi.org/project/groq/>`__ is **required if and only if you want to use Groq Whisper API** (``recognizer_instance.recognize_groq``).
You can install it with ``python3 -m pip install SpeechRecognition[groq]``.
Please set the environment variable ``GROQ_API_KEY`` before calling ``recognizer_instance.recognize_groq``.
Troubleshooting
---------------
The recognizer tries to recognize speech even when I'm not speaking, or after I'm done speaking.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Try increasing the ``recognizer_instance.energy_threshold`` property. This is basically how sensitive the recognizer is to when recognition should start. Higher values mean that it will be less sensitive, which is useful if you are in a loud room.
This value depends entirely on your microphone or audio data. There is no one-size-fits-all value, but good values typically range from 50 to 4000.
Also, check on your microphone volume settings. If it is too sensitive, the microphone may be picking up a lot of ambient noise. If it is too insensitive, the microphone may be rejecting speech as just noise.
The recognizer can't recognize speech right after it starts listening for the first time.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The ``recognizer_instance.energy_threshold`` property is probably set to a value that is too high to start off with, and then being adjusted lower automatically by dynamic energy threshold adjustment. Before it is at a good level, the energy threshold is so high that speech is just considered ambient noise.
The solution is to decrease this threshold, or call ``recognizer_instance.adjust_for_ambient_noise`` beforehand, which will set the threshold to a good value automatically.
The recognizer doesn't understand my particular language/dialect.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Try setting the recognition language to your language/dialect. To do this, see the documentation for ``recognizer_instance.recognize_sphinx``, ``recognizer_instance.recognize_google``, ``recognizer_instance.recognize_wit``, ``recognizer_instance.recognize_bing``, ``recognizer_instance.recognize_api``, ``recognizer_instance.recognize_houndify``, and ``recognizer_instance.recognize_ibm``.
For example, if your language/dialect is British English, it is better to use ``"en-GB"`` as the language rather than ``"en-US"``.
The recognizer hangs on ``recognizer_instance.listen``; specifically, when it's calling ``Microphone.MicrophoneStream.read``.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This usually happens when you're using a Raspberry Pi board, which doesn't have audio input capabilities by itself. This causes the default microphone used by PyAudio to simply block when we try to read it. If you happen to be using a Raspberry Pi, you'll need a USB sound card (or USB microphone).
Once you do this, change all instances of ``Microphone()`` to ``Microphone(device_index=MICROPHONE_INDEX)``, where ``MICROPHONE_INDEX`` is the hardware-specific index of the microphone.
To figure out what the value of ``MICROPHONE_INDEX`` should be, run the following code:
.. code:: python
import speech_recognition as sr
for index, name in enumerate(sr.Microphone.list_microphone_names()):
print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name))
This will print out something like the following:
::
Microphone with name "HDA Intel HDMI: 0 (hw:0,3)" found for `Microphone(device_index=0)`
Microphone with name "HDA Intel HDMI: 1 (hw:0,7)" found for `Microphone(device_index=1)`
Microphone with name "HDA Intel HDMI: 2 (hw:0,8)" found for `Microphone(device_index=2)`
Microphone with name "Blue Snowball: USB Audio (hw:1,0)" found for `Microphone(device_index=3)`
Microphone with name "hdmi" found for `Microphone(device_index=4)`
Microphone with name "pulse" found for `Microphone(device_index=5)`
Microphone with name "default" found for `Microphone(device_index=6)`
Now, to use the Snowball microphone, you would change ``Microphone()`` to ``Microphone(device_index=3)``.
Calling ``Microphone()`` gives the error ``IOError: No Default Input Device Available``.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
As the error says, the program doesn't know which microphone to use.
To proceed, either use ``Microphone(device_index=MICROPHONE_INDEX, ...)`` instead of ``Microphone(...)``, or set a default microphone in your OS. You can obtain possible values of ``MICROPHONE_INDEX`` using the code in the troubleshooting entry right above this one.
The program doesn't run when compiled with `PyInstaller <https://github.com/pyinstaller/pyinstaller/wiki>`__.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
As of PyInstaller version 3.0, SpeechRecognition is supported out of the box. If you're getting weird issues when compiling your program using PyInstaller, simply update PyInstaller.
You can easily do this by running ``pip install --upgrade pyinstaller``.
On Ubuntu/Debian, I get annoying output in the terminal saying things like "bt_audio_service_open: [...] Connection refused" and various others.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The "bt_audio_service_open" error means that you have a Bluetooth audio device, but as a physical device is not currently connected, we can't actually use it - if you're not using a Bluetooth microphone, then this can be safely ignored. If you are, and audio isn't working, then double check to make sure your microphone is actually connected. There does not seem to be a simple way to disable these messages.
For errors of the form "ALSA lib [...] Unknown PCM", see `this StackOverflow answer <http://stackoverflow.com/questions/7088672/pyaudio-working-but-spits-out-error-messages-each-time>`__. Basically, to get rid of an error of the form "Unknown PCM cards.pcm.rear", simply comment out ``pcm.rear cards.pcm.rear`` in ``/usr/share/alsa/alsa.conf``, ``~/.asoundrc``, and ``/etc/asound.conf``.
For "jack server is not running or cannot be started" or "connect(2) call to /dev/shm/jack-1000/default/jack_0 failed (err=No such file or directory)" or "attempt to connect to server failed", these are caused by ALSA trying to connect to JACK, and can be safely ignored. I'm not aware of any simple way to turn those messages off at this time, besides `entirely disabling printing while starting the microphone <https://github.com/Uberi/speech_recognition/issues/182#issuecomment-266256337>`__.
On OS X, I get a ``ChildProcessError`` saying that it couldn't find the system FLAC converter, even though it's installed.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Installing `FLAC for OS X <https://xiph.org/flac/download.html>`__ directly from the source code will not work, since it doesn't correctly add the executables to the search path.
Installing FLAC using `Homebrew <http://brew.sh/>`__ ensures that the search path is correctly updated. First, ensure you have Homebrew, then run ``brew install flac`` to install the necessary files.
Developing
----------
To hack on this library, first make sure you have all the requirements listed in the "Requirements" section.
- Most of the library code lives in ``speech_recognition/__init__.py``.
- Examples live under the ``examples/`` `directory <https://github.com/Uberi/speech_recognition/tree/master/examples>`__, and the demo script lives in ``speech_recognition/__main__.py``.
- The FLAC encoder binaries are in the ``speech_recognition/`` `directory <https://github.com/Uberi/speech_recognition/tree/master/speech_recognition>`__.
- Documentation can be found in the ``reference/`` `directory <https://github.com/Uberi/speech_recognition/tree/master/reference>`__.
- Third-party libraries, utilities, and reference material are in the ``third-party/`` `directory <https://github.com/Uberi/speech_recognition/tree/master/third-party>`__.
To install/reinstall the library locally, run ``python -m pip install -e .[dev]`` in the project `root directory <https://github.com/Uberi/speech_recognition>`__.
Before a release, the version number is bumped in ``README.rst`` and ``speech_recognition/__init__.py``. Version tags are then created using ``git config gpg.program gpg2 && git config user.signingkey DB45F6C431DE7C2DCD99FF7904882258A4063489 && git tag -s VERSION_GOES_HERE -m "Version VERSION_GOES_HERE"``.
Releases are done by running ``make-release.sh VERSION_GOES_HERE`` to build the Python source packages, sign them, and upload them to PyPI.
Testing
~~~~~~~
Prerequisite: `Install pipx <https://pipx.pypa.io/stable/installation/>`__.
To run all the tests:
.. code:: bash
python -m unittest discover --verbose
To run static analysis:
.. code:: bash
make lint
To ensure RST is well-formed:
.. code:: bash
make rstcheck
Testing is also done automatically by GitHub Actions, upon every push.
FLAC Executables
~~~~~~~~~~~~~~~~
The included ``flac-win32`` executable is the `official FLAC 1.3.2 32-bit Windows binary <http://downloads.xiph.org/releases/flac/flac-1.3.2-win.zip>`__.
The included ``flac-linux-x86`` and ``flac-linux-x86_64`` executables are built from the `FLAC 1.3.2 source code <http://downloads.xiph.org/releases/flac/flac-1.3.2.tar.xz>`__ with `Manylinux <https://github.com/pypa/manylinux>`__ to ensure that it's compatible with a wide variety of distributions.
The built FLAC executables should be bit-for-bit reproducible. To rebuild them, run the following inside the project directory on a Debian-like system:
.. code:: bash
# download and extract the FLAC source code
cd third-party
sudo apt-get install --yes docker.io
# build FLAC inside the Manylinux i686 Docker image
tar xf flac-1.3.2.tar.xz
sudo docker run --tty --interactive --rm --volume "$(pwd):/root" quay.io/pypa/manylinux1_i686:latest bash
cd /root/flac-1.3.2
./configure LDFLAGS=-static # compiler flags to make a static build
make
exit
cp flac-1.3.2/src/flac/flac ../speech_recognition/flac-linux-x86 && sudo rm -rf flac-1.3.2/
# build FLAC inside the Manylinux x86_64 Docker image
tar xf flac-1.3.2.tar.xz
sudo docker run --tty --interactive --rm --volume "$(pwd):/root" quay.io/pypa/manylinux1_x86_64:latest bash
cd /root/flac-1.3.2
./configure LDFLAGS=-static # compiler flags to make a static build
make
exit
cp flac-1.3.2/src/flac/flac ../speech_recognition/flac-linux-x86_64 && sudo rm -r flac-1.3.2/
The included ``flac-mac`` executable is extracted from `xACT 2.39 <http://xact.scottcbrown.org/>`__, which is a frontend for FLAC 1.3.2 that conveniently includes binaries for all of its encoders. Specifically, it is a copy of ``xACT 2.39/xACT.app/Contents/Resources/flac`` in ``xACT2.39.zip``.
Authors
-------
::
Uberi <me@anthonyz.ca> (Anthony Zhang)
bobsayshilol
arvindch <achembarpu@gmail.com> (Arvind Chembarpu)
kevinismith <kevin_i_smith@yahoo.com> (Kevin Smith)
haas85
DelightRun <changxu.mail@gmail.com>
maverickagm
kamushadenes <kamushadenes@hyadesinc.com> (Kamus Hadenes)
sbraden <braden.sarah@gmail.com> (Sarah Braden)
tb0hdan (Bohdan Turkynewych)
Thynix <steve@asksteved.com> (Steve Dougherty)
beeedy <broderick.carlin@gmail.com> (Broderick Carlin)
Please report bugs and suggestions at the `issue tracker <https://github.com/Uberi/speech_recognition/issues>`__!
How to cite this library (APA style):
Zhang, A. (2017). Speech Recognition (Version 3.11) [Software]. Available from https://github.com/Uberi/speech_recognition#readme.
How to cite this library (Chicago style):
Zhang, Anthony. 2017. *Speech Recognition* (version 3.11).
Also check out the `Python Baidu Yuyin API <https://github.com/DelightRun/PyBaiduYuyin>`__, which is based on an older version of this project, and adds support for `Baidu Yuyin <http://yuyin.baidu.com/>`__. Note that Baidu Yuyin is only available inside China.
License
-------
Copyright 2014- `Anthony Zhang (Uberi) <http://anthonyz.ca/>`__. The source code for this library is available online at `GitHub <https://github.com/Uberi/speech_recognition>`__.
SpeechRecognition is made available under the 3-clause BSD license. See ``LICENSE.txt`` in the project's `root directory <https://github.com/Uberi/speech_recognition>`__ for more information.
For convenience, all the official distributions of SpeechRecognition already include a copy of the necessary copyright notices and licenses. In your project, you can simply **say that licensing information for SpeechRecognition can be found within the SpeechRecognition README, and make sure SpeechRecognition is visible to users if they wish to see it**.
SpeechRecognition distributes language files from `CMU Sphinx <http://cmusphinx.sourceforge.net/>`__. These files are BSD-licensed and redistributable as long as copyright notices are correctly retained. See ``speech_recognition/pocketsphinx-data/*/LICENSE*.txt`` for license details for individual parts.
SpeechRecognition distributes binaries from `FLAC <https://xiph.org/flac/>`__ - ``speech_recognition/flac-win32.exe``, ``speech_recognition/flac-linux-x86``, and ``speech_recognition/flac-mac``. These files are GPLv2-licensed and redistributable, as long as the terms of the GPL are satisfied. The FLAC binaries are an `aggregate <https://www.gnu.org/licenses/gpl-faq.html#MereAggregation>`__ of `separate programs <https://www.gnu.org/licenses/gpl-faq.html#NFUseGPLPlugins>`__, so these GPL restrictions do not apply to the library or your programs that use the library, only to FLAC itself. See ``LICENSE-FLAC.txt`` for license details.

View File

@ -0,0 +1,63 @@
SpeechRecognition-3.14.2.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
SpeechRecognition-3.14.2.dist-info/LICENSE-FLAC.txt,sha256=gXf5dRMhNSbfLPYYTY_5hsZ1r7UU1OaKQEAQUhuIBkM,18092
SpeechRecognition-3.14.2.dist-info/LICENSE.txt,sha256=SqBKTm-NBIGjRpyJw3tWUIptM0QqL5MGRDN9k_JSmmU,1515
SpeechRecognition-3.14.2.dist-info/METADATA,sha256=VQmSG3yuZ4V-cbIp_dA3Jjh9VNwV9D_eKpA9l6Fh5lI,29782
SpeechRecognition-3.14.2.dist-info/RECORD,,
SpeechRecognition-3.14.2.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
SpeechRecognition-3.14.2.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
SpeechRecognition-3.14.2.dist-info/top_level.txt,sha256=QKWAxpoWgbZk34beXvH8PdQwuAWYGijYCmC3owLhul0,25
speech_recognition/__init__.py,sha256=7Zqcwx6H1mZQDrwlaDPswMGLfi7iTjZ7quy-Rc8RO30,77351
speech_recognition/__main__.py,sha256=Afbp3l1yq2AW2Bsm7YBz-O0wm6gHNudgPZh69Ijjubc,833
speech_recognition/__pycache__/__init__.cpython-311.pyc,,
speech_recognition/__pycache__/__main__.cpython-311.pyc,,
speech_recognition/__pycache__/audio.cpython-311.pyc,,
speech_recognition/__pycache__/exceptions.cpython-311.pyc,,
speech_recognition/audio.py,sha256=THoN6uaHcbbwcvhW6vzkBpAZkpP8s-ouClba48KbOkw,14824
speech_recognition/exceptions.py,sha256=LnKmutOVQ6RSMcSO1Ji5Af-UZN-Aqrq4petMgBFXkKs,273
speech_recognition/flac-linux-x86,sha256=FOUk-MAmqO11z7GmT4TyHIJnRZaDXZaBVsh5yfP1k8g,1899154
speech_recognition/flac-linux-x86_64,sha256=0k6-i9XM2vxk9SKmKuL-0_t0ARrf7ts9Peqmu0YR34Q,2396644
speech_recognition/flac-mac,sha256=2LyQYpHz0QJGWm_dnNfJoIOYsDsPI8b-JwZAVns4o6E,451168
speech_recognition/flac-win32.exe,sha256=D8yWtMDceZCrDDnb-8gvrtNmxYiPD0oz9GTqO3m38IU,738816
speech_recognition/pocketsphinx-data/en-US/LICENSE.txt,sha256=_PXE5BroH3BO1w8VERRicu8PDKe5cIjRHXKJa1ahifU,1537
speech_recognition/pocketsphinx-data/en-US/acoustic-model/README,sha256=i4jemAVoUJxkbQUnuEFL7vE2lkORkDtAmW0y9ze_dS4,1617
speech_recognition/pocketsphinx-data/en-US/acoustic-model/feat.params,sha256=ioWtKGlsfyNDRE52M84w8WX-2HQPghlUVExvvUb0lR8,165
speech_recognition/pocketsphinx-data/en-US/acoustic-model/mdef,sha256=I2D5qGiJwc_ui9YYoCaTh5EeX7KSCllPUGsYuMeWg7A,2959176
speech_recognition/pocketsphinx-data/en-US/acoustic-model/means,sha256=gyAZ4yysEusxiWT5b0aQNKyxLQNI7t3DgxgxoQDLTdQ,838732
speech_recognition/pocketsphinx-data/en-US/acoustic-model/noisedict,sha256=cpWwffLCBMT4fGeCtr4aOFnXAG1OOGQYHJVdbasQWjM,56
speech_recognition/pocketsphinx-data/en-US/acoustic-model/sendump,sha256=jJVkwNW-9pyp2b8QFKvhYvBxZEzwLPH6ikg8PcFlp6g,1969024
speech_recognition/pocketsphinx-data/en-US/acoustic-model/transition_matrices,sha256=wffyjqQxd75zS-H4i9fxuahT0OZg-FmcZ8bq7Ki7U5o,2080
speech_recognition/pocketsphinx-data/en-US/acoustic-model/variances,sha256=sA1pb4XpaDT8EPjl8GQo2MTba__b5YRbb2m_bvvEj6U,838732
speech_recognition/pocketsphinx-data/en-US/language-model.lm.bin,sha256=Ttj1LtBBMEXw6cZaRwmVOJa2_5qZ68EPfvkm9mLF5QY,29208442
speech_recognition/pocketsphinx-data/en-US/pronounciation-dictionary.dict,sha256=zjv6Nk45C35Ip0g9NQpI2fFhEn9oQoucb5ZygR_BE08,3240807
speech_recognition/recognizers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
speech_recognition/recognizers/__pycache__/__init__.cpython-311.pyc,,
speech_recognition/recognizers/__pycache__/google.cpython-311.pyc,,
speech_recognition/recognizers/__pycache__/google_cloud.cpython-311.pyc,,
speech_recognition/recognizers/__pycache__/pocketsphinx.cpython-311.pyc,,
speech_recognition/recognizers/google.py,sha256=bSgF7Vxmjt8nOsDt3QMsJ_ecPYM22irTXlFjtEavp8Y,10149
speech_recognition/recognizers/google_cloud.py,sha256=-07GBAfp---EJ6RIjlbcQoDUIxDmDHNOJhJFgt3-z1Q,6101
speech_recognition/recognizers/pocketsphinx.py,sha256=Y8fw4mPJusFk_Xhrlzh82xLFNGLlfJzhWCWPTGy4yvE,7349
speech_recognition/recognizers/whisper_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
speech_recognition/recognizers/whisper_api/__pycache__/__init__.cpython-311.pyc,,
speech_recognition/recognizers/whisper_api/__pycache__/base.cpython-311.pyc,,
speech_recognition/recognizers/whisper_api/__pycache__/groq.cpython-311.pyc,,
speech_recognition/recognizers/whisper_api/__pycache__/openai.cpython-311.pyc,,
speech_recognition/recognizers/whisper_api/base.py,sha256=lkONeDe3ec6Z5qt0gRxOijyB-UUu7n_uwJ64GVdtJ5s,679
speech_recognition/recognizers/whisper_api/groq.py,sha256=UmzcfNhMLrVd0QyvsypcpZk1GtWDzvdt9h2Y-m1Oe1o,1638
speech_recognition/recognizers/whisper_api/openai.py,sha256=vcGF3qkbU0ABrL6jl8mqCaycwd2Fx2dFfIA8DbQ1VqA,2566
speech_recognition/recognizers/whisper_local/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
speech_recognition/recognizers/whisper_local/__pycache__/__init__.cpython-311.pyc,,
speech_recognition/recognizers/whisper_local/__pycache__/base.cpython-311.pyc,,
speech_recognition/recognizers/whisper_local/__pycache__/faster_whisper.cpython-311.pyc,,
speech_recognition/recognizers/whisper_local/__pycache__/whisper.cpython-311.pyc,,
speech_recognition/recognizers/whisper_local/base.py,sha256=W0Q14-VpzS6yGu_vkg1EPN2SQ-RnbykdEKJiW-2_Lis,1265
speech_recognition/recognizers/whisper_local/faster_whisper.py,sha256=LRWJj0yrHncMcYhAjHQjJyIuhsPTFVP-JrtXdkRzdrg,3252
speech_recognition/recognizers/whisper_local/whisper.py,sha256=tGVrErg8jnC7m7QvACZDB3lanhOeyUjX-XPxPvaTozk,3340
tests/__init__.py,sha256=nWC1VOR8uKeCB0MkEfbl0uCsUZKDoqTHhSNyYyVSzUo,133
tests/__pycache__/__init__.cpython-311.pyc,,
tests/__pycache__/test_audio.cpython-311.pyc,,
tests/__pycache__/test_recognition.cpython-311.pyc,,
tests/__pycache__/test_special_features.cpython-311.pyc,,
tests/test_audio.py,sha256=eN86PHiVbD_0n85vxcQBwsFvsEemaEWdLXDtMgp93BM,9591
tests/test_recognition.py,sha256=Kg4sN1UZ-Y4b8H1M5_nVwe7D9DoOHC0hiQlEKkV9MR0,5139
tests/test_special_features.py,sha256=yC3NUu6VPZ6TCf40QxvxceJgbQ1lJpxjQyoSzGhUa_0,1525

View File

@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: setuptools (75.3.0)
Root-Is-Purelib: true
Tag: py3-none-any

View File

@ -0,0 +1,2 @@
speech_recognition
tests

View File

@ -0,0 +1,222 @@
# don't import any costly modules
import sys
import os
is_pypy = '__pypy__' in sys.builtin_module_names
def warn_distutils_present():
if 'distutils' not in sys.modules:
return
if is_pypy and sys.version_info < (3, 7):
# PyPy for 3.6 unconditionally imports distutils, so bypass the warning
# https://foss.heptapod.net/pypy/pypy/-/blob/be829135bc0d758997b3566062999ee8b23872b4/lib-python/3/site.py#L250
return
import warnings
warnings.warn(
"Distutils was imported before Setuptools, but importing Setuptools "
"also replaces the `distutils` module in `sys.modules`. This may lead "
"to undesirable behaviors or errors. To avoid these issues, avoid "
"using distutils directly, ensure that setuptools is installed in the "
"traditional way (e.g. not an editable install), and/or make sure "
"that setuptools is always imported before distutils."
)
def clear_distutils():
if 'distutils' not in sys.modules:
return
import warnings
warnings.warn("Setuptools is replacing distutils.")
mods = [
name
for name in sys.modules
if name == "distutils" or name.startswith("distutils.")
]
for name in mods:
del sys.modules[name]
def enabled():
"""
Allow selection of distutils by environment variable.
"""
which = os.environ.get('SETUPTOOLS_USE_DISTUTILS', 'local')
return which == 'local'
def ensure_local_distutils():
import importlib
clear_distutils()
# With the DistutilsMetaFinder in place,
# perform an import to cause distutils to be
# loaded from setuptools._distutils. Ref #2906.
with shim():
importlib.import_module('distutils')
# check that submodules load as expected
core = importlib.import_module('distutils.core')
assert '_distutils' in core.__file__, core.__file__
assert 'setuptools._distutils.log' not in sys.modules
def do_override():
"""
Ensure that the local copy of distutils is preferred over stdlib.
See https://github.com/pypa/setuptools/issues/417#issuecomment-392298401
for more motivation.
"""
if enabled():
warn_distutils_present()
ensure_local_distutils()
class _TrivialRe:
def __init__(self, *patterns):
self._patterns = patterns
def match(self, string):
return all(pat in string for pat in self._patterns)
class DistutilsMetaFinder:
def find_spec(self, fullname, path, target=None):
# optimization: only consider top level modules and those
# found in the CPython test suite.
if path is not None and not fullname.startswith('test.'):
return
method_name = 'spec_for_{fullname}'.format(**locals())
method = getattr(self, method_name, lambda: None)
return method()
def spec_for_distutils(self):
if self.is_cpython():
return
import importlib
import importlib.abc
import importlib.util
try:
mod = importlib.import_module('setuptools._distutils')
except Exception:
# There are a couple of cases where setuptools._distutils
# may not be present:
# - An older Setuptools without a local distutils is
# taking precedence. Ref #2957.
# - Path manipulation during sitecustomize removes
# setuptools from the path but only after the hook
# has been loaded. Ref #2980.
# In either case, fall back to stdlib behavior.
return
class DistutilsLoader(importlib.abc.Loader):
def create_module(self, spec):
mod.__name__ = 'distutils'
return mod
def exec_module(self, module):
pass
return importlib.util.spec_from_loader(
'distutils', DistutilsLoader(), origin=mod.__file__
)
@staticmethod
def is_cpython():
"""
Suppress supplying distutils for CPython (build and tests).
Ref #2965 and #3007.
"""
return os.path.isfile('pybuilddir.txt')
def spec_for_pip(self):
"""
Ensure stdlib distutils when running under pip.
See pypa/pip#8761 for rationale.
"""
if self.pip_imported_during_build():
return
clear_distutils()
self.spec_for_distutils = lambda: None
@classmethod
def pip_imported_during_build(cls):
"""
Detect if pip is being imported in a build script. Ref #2355.
"""
import traceback
return any(
cls.frame_file_is_setup(frame) for frame, line in traceback.walk_stack(None)
)
@staticmethod
def frame_file_is_setup(frame):
"""
Return True if the indicated frame suggests a setup.py file.
"""
# some frames may not have __file__ (#2940)
return frame.f_globals.get('__file__', '').endswith('setup.py')
def spec_for_sensitive_tests(self):
"""
Ensure stdlib distutils when running select tests under CPython.
python/cpython#91169
"""
clear_distutils()
self.spec_for_distutils = lambda: None
sensitive_tests = (
[
'test.test_distutils',
'test.test_peg_generator',
'test.test_importlib',
]
if sys.version_info < (3, 10)
else [
'test.test_distutils',
]
)
for name in DistutilsMetaFinder.sensitive_tests:
setattr(
DistutilsMetaFinder,
f'spec_for_{name}',
DistutilsMetaFinder.spec_for_sensitive_tests,
)
DISTUTILS_FINDER = DistutilsMetaFinder()
def add_shim():
DISTUTILS_FINDER in sys.meta_path or insert_shim()
class shim:
def __enter__(self):
insert_shim()
def __exit__(self, exc, value, tb):
remove_shim()
def insert_shim():
sys.meta_path.insert(0, DISTUTILS_FINDER)
def remove_shim():
try:
sys.meta_path.remove(DISTUTILS_FINDER)
except ValueError:
pass

View File

@ -0,0 +1 @@
__import__('_distutils_hack').do_override()

View File

@ -0,0 +1,20 @@
Copyright 2010 Jason Kirtland
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@ -0,0 +1,60 @@
Metadata-Version: 2.3
Name: blinker
Version: 1.9.0
Summary: Fast, simple object-to-object and broadcast signaling
Author: Jason Kirtland
Maintainer-email: Pallets Ecosystem <contact@palletsprojects.com>
Requires-Python: >=3.9
Description-Content-Type: text/markdown
Classifier: Development Status :: 5 - Production/Stable
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python
Classifier: Typing :: Typed
Project-URL: Chat, https://discord.gg/pallets
Project-URL: Documentation, https://blinker.readthedocs.io
Project-URL: Source, https://github.com/pallets-eco/blinker/
# Blinker
Blinker provides a fast dispatching system that allows any number of
interested parties to subscribe to events, or "signals".
## Pallets Community Ecosystem
> [!IMPORTANT]\
> This project is part of the Pallets Community Ecosystem. Pallets is the open
> source organization that maintains Flask; Pallets-Eco enables community
> maintenance of related projects. If you are interested in helping maintain
> this project, please reach out on [the Pallets Discord server][discord].
>
> [discord]: https://discord.gg/pallets
## Example
Signal receivers can subscribe to specific senders or receive signals
sent by any sender.
```pycon
>>> from blinker import signal
>>> started = signal('round-started')
>>> def each(round):
... print(f"Round {round}")
...
>>> started.connect(each)
>>> def round_two(round):
... print("This is round two.")
...
>>> started.connect(round_two, sender=2)
>>> for round in range(1, 4):
... started.send(round)
...
Round 1!
Round 2!
This is round two.
Round 3!
```

View File

@ -0,0 +1,12 @@
blinker-1.9.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
blinker-1.9.0.dist-info/LICENSE.txt,sha256=nrc6HzhZekqhcCXSrhvjg5Ykx5XphdTw6Xac4p-spGc,1054
blinker-1.9.0.dist-info/METADATA,sha256=uIRiM8wjjbHkCtbCyTvctU37IAZk0kEe5kxAld1dvzA,1633
blinker-1.9.0.dist-info/RECORD,,
blinker-1.9.0.dist-info/WHEEL,sha256=CpUCUxeHQbRN5UGRQHYRJorO5Af-Qy_fHMctcQ8DSGI,82
blinker/__init__.py,sha256=I2EdZqpy4LyjX17Hn1yzJGWCjeLaVaPzsMgHkLfj_cQ,317
blinker/__pycache__/__init__.cpython-311.pyc,,
blinker/__pycache__/_utilities.cpython-311.pyc,,
blinker/__pycache__/base.cpython-311.pyc,,
blinker/_utilities.py,sha256=0J7eeXXTUx0Ivf8asfpx0ycVkp0Eqfqnj117x2mYX9E,1675
blinker/base.py,sha256=QpDuvXXcwJF49lUBcH5BiST46Rz9wSG7VW_p7N_027M,19132
blinker/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

View File

@ -0,0 +1,4 @@
Wheel-Version: 1.0
Generator: flit 3.10.1
Root-Is-Purelib: true
Tag: py3-none-any

View File

@ -0,0 +1,17 @@
from __future__ import annotations
from .base import ANY
from .base import default_namespace
from .base import NamedSignal
from .base import Namespace
from .base import Signal
from .base import signal
__all__ = [
"ANY",
"default_namespace",
"NamedSignal",
"Namespace",
"Signal",
"signal",
]

View File

@ -0,0 +1,64 @@
from __future__ import annotations
import collections.abc as c
import inspect
import typing as t
from weakref import ref
from weakref import WeakMethod
T = t.TypeVar("T")
class Symbol:
"""A constant symbol, nicer than ``object()``. Repeated calls return the
same instance.
>>> Symbol('foo') is Symbol('foo')
True
>>> Symbol('foo')
foo
"""
symbols: t.ClassVar[dict[str, Symbol]] = {}
def __new__(cls, name: str) -> Symbol:
if name in cls.symbols:
return cls.symbols[name]
obj = super().__new__(cls)
cls.symbols[name] = obj
return obj
def __init__(self, name: str) -> None:
self.name = name
def __repr__(self) -> str:
return self.name
def __getnewargs__(self) -> tuple[t.Any, ...]:
return (self.name,)
def make_id(obj: object) -> c.Hashable:
"""Get a stable identifier for a receiver or sender, to be used as a dict
key or in a set.
"""
if inspect.ismethod(obj):
# The id of a bound method is not stable, but the id of the unbound
# function and instance are.
return id(obj.__func__), id(obj.__self__)
if isinstance(obj, (str, int)):
# Instances with the same value always compare equal and have the same
# hash, even if the id may change.
return obj
# Assume other types are not hashable but will always be the same instance.
return id(obj)
def make_ref(obj: T, callback: c.Callable[[ref[T]], None] | None = None) -> ref[T]:
if inspect.ismethod(obj):
return WeakMethod(obj, callback) # type: ignore[arg-type, return-value]
return ref(obj, callback)

View File

@ -0,0 +1,512 @@
from __future__ import annotations
import collections.abc as c
import sys
import typing as t
import weakref
from collections import defaultdict
from contextlib import contextmanager
from functools import cached_property
from inspect import iscoroutinefunction
from ._utilities import make_id
from ._utilities import make_ref
from ._utilities import Symbol
F = t.TypeVar("F", bound=c.Callable[..., t.Any])
ANY = Symbol("ANY")
"""Symbol for "any sender"."""
ANY_ID = 0
class Signal:
"""A notification emitter.
:param doc: The docstring for the signal.
"""
ANY = ANY
"""An alias for the :data:`~blinker.ANY` sender symbol."""
set_class: type[set[t.Any]] = set
"""The set class to use for tracking connected receivers and senders.
Python's ``set`` is unordered. If receivers must be dispatched in the order
they were connected, an ordered set implementation can be used.
.. versionadded:: 1.7
"""
@cached_property
def receiver_connected(self) -> Signal:
"""Emitted at the end of each :meth:`connect` call.
The signal sender is the signal instance, and the :meth:`connect`
arguments are passed through: ``receiver``, ``sender``, and ``weak``.
.. versionadded:: 1.2
"""
return Signal(doc="Emitted after a receiver connects.")
@cached_property
def receiver_disconnected(self) -> Signal:
"""Emitted at the end of each :meth:`disconnect` call.
The sender is the signal instance, and the :meth:`disconnect` arguments
are passed through: ``receiver`` and ``sender``.
This signal is emitted **only** when :meth:`disconnect` is called
explicitly. This signal cannot be emitted by an automatic disconnect
when a weakly referenced receiver or sender goes out of scope, as the
instance is no longer be available to be used as the sender for this
signal.
An alternative approach is available by subscribing to
:attr:`receiver_connected` and setting up a custom weakref cleanup
callback on weak receivers and senders.
.. versionadded:: 1.2
"""
return Signal(doc="Emitted after a receiver disconnects.")
def __init__(self, doc: str | None = None) -> None:
if doc:
self.__doc__ = doc
self.receivers: dict[
t.Any, weakref.ref[c.Callable[..., t.Any]] | c.Callable[..., t.Any]
] = {}
"""The map of connected receivers. Useful to quickly check if any
receivers are connected to the signal: ``if s.receivers:``. The
structure and data is not part of the public API, but checking its
boolean value is.
"""
self.is_muted: bool = False
self._by_receiver: dict[t.Any, set[t.Any]] = defaultdict(self.set_class)
self._by_sender: dict[t.Any, set[t.Any]] = defaultdict(self.set_class)
self._weak_senders: dict[t.Any, weakref.ref[t.Any]] = {}
def connect(self, receiver: F, sender: t.Any = ANY, weak: bool = True) -> F:
"""Connect ``receiver`` to be called when the signal is sent by
``sender``.
:param receiver: The callable to call when :meth:`send` is called with
the given ``sender``, passing ``sender`` as a positional argument
along with any extra keyword arguments.
:param sender: Any object or :data:`ANY`. ``receiver`` will only be
called when :meth:`send` is called with this sender. If ``ANY``, the
receiver will be called for any sender. A receiver may be connected
to multiple senders by calling :meth:`connect` multiple times.
:param weak: Track the receiver with a :mod:`weakref`. The receiver will
be automatically disconnected when it is garbage collected. When
connecting a receiver defined within a function, set to ``False``,
otherwise it will be disconnected when the function scope ends.
"""
receiver_id = make_id(receiver)
sender_id = ANY_ID if sender is ANY else make_id(sender)
if weak:
self.receivers[receiver_id] = make_ref(
receiver, self._make_cleanup_receiver(receiver_id)
)
else:
self.receivers[receiver_id] = receiver
self._by_sender[sender_id].add(receiver_id)
self._by_receiver[receiver_id].add(sender_id)
if sender is not ANY and sender_id not in self._weak_senders:
# store a cleanup for weakref-able senders
try:
self._weak_senders[sender_id] = make_ref(
sender, self._make_cleanup_sender(sender_id)
)
except TypeError:
pass
if "receiver_connected" in self.__dict__ and self.receiver_connected.receivers:
try:
self.receiver_connected.send(
self, receiver=receiver, sender=sender, weak=weak
)
except TypeError:
# TODO no explanation or test for this
self.disconnect(receiver, sender)
raise
return receiver
def connect_via(self, sender: t.Any, weak: bool = False) -> c.Callable[[F], F]:
"""Connect the decorated function to be called when the signal is sent
by ``sender``.
The decorated function will be called when :meth:`send` is called with
the given ``sender``, passing ``sender`` as a positional argument along
with any extra keyword arguments.
:param sender: Any object or :data:`ANY`. ``receiver`` will only be
called when :meth:`send` is called with this sender. If ``ANY``, the
receiver will be called for any sender. A receiver may be connected
to multiple senders by calling :meth:`connect` multiple times.
:param weak: Track the receiver with a :mod:`weakref`. The receiver will
be automatically disconnected when it is garbage collected. When
connecting a receiver defined within a function, set to ``False``,
otherwise it will be disconnected when the function scope ends.=
.. versionadded:: 1.1
"""
def decorator(fn: F) -> F:
self.connect(fn, sender, weak)
return fn
return decorator
@contextmanager
def connected_to(
self, receiver: c.Callable[..., t.Any], sender: t.Any = ANY
) -> c.Generator[None, None, None]:
"""A context manager that temporarily connects ``receiver`` to the
signal while a ``with`` block executes. When the block exits, the
receiver is disconnected. Useful for tests.
:param receiver: The callable to call when :meth:`send` is called with
the given ``sender``, passing ``sender`` as a positional argument
along with any extra keyword arguments.
:param sender: Any object or :data:`ANY`. ``receiver`` will only be
called when :meth:`send` is called with this sender. If ``ANY``, the
receiver will be called for any sender.
.. versionadded:: 1.1
"""
self.connect(receiver, sender=sender, weak=False)
try:
yield None
finally:
self.disconnect(receiver)
@contextmanager
def muted(self) -> c.Generator[None, None, None]:
"""A context manager that temporarily disables the signal. No receivers
will be called if the signal is sent, until the ``with`` block exits.
Useful for tests.
"""
self.is_muted = True
try:
yield None
finally:
self.is_muted = False
def send(
self,
sender: t.Any | None = None,
/,
*,
_async_wrapper: c.Callable[
[c.Callable[..., c.Coroutine[t.Any, t.Any, t.Any]]], c.Callable[..., t.Any]
]
| None = None,
**kwargs: t.Any,
) -> list[tuple[c.Callable[..., t.Any], t.Any]]:
"""Call all receivers that are connected to the given ``sender``
or :data:`ANY`. Each receiver is called with ``sender`` as a positional
argument along with any extra keyword arguments. Return a list of
``(receiver, return value)`` tuples.
The order receivers are called is undefined, but can be influenced by
setting :attr:`set_class`.
If a receiver raises an exception, that exception will propagate up.
This makes debugging straightforward, with an assumption that correctly
implemented receivers will not raise.
:param sender: Call receivers connected to this sender, in addition to
those connected to :data:`ANY`.
:param _async_wrapper: Will be called on any receivers that are async
coroutines to turn them into sync callables. For example, could run
the receiver with an event loop.
:param kwargs: Extra keyword arguments to pass to each receiver.
.. versionchanged:: 1.7
Added the ``_async_wrapper`` argument.
"""
if self.is_muted:
return []
results = []
for receiver in self.receivers_for(sender):
if iscoroutinefunction(receiver):
if _async_wrapper is None:
raise RuntimeError("Cannot send to a coroutine function.")
result = _async_wrapper(receiver)(sender, **kwargs)
else:
result = receiver(sender, **kwargs)
results.append((receiver, result))
return results
async def send_async(
self,
sender: t.Any | None = None,
/,
*,
_sync_wrapper: c.Callable[
[c.Callable[..., t.Any]], c.Callable[..., c.Coroutine[t.Any, t.Any, t.Any]]
]
| None = None,
**kwargs: t.Any,
) -> list[tuple[c.Callable[..., t.Any], t.Any]]:
"""Await all receivers that are connected to the given ``sender``
or :data:`ANY`. Each receiver is called with ``sender`` as a positional
argument along with any extra keyword arguments. Return a list of
``(receiver, return value)`` tuples.
The order receivers are called is undefined, but can be influenced by
setting :attr:`set_class`.
If a receiver raises an exception, that exception will propagate up.
This makes debugging straightforward, with an assumption that correctly
implemented receivers will not raise.
:param sender: Call receivers connected to this sender, in addition to
those connected to :data:`ANY`.
:param _sync_wrapper: Will be called on any receivers that are sync
callables to turn them into async coroutines. For example,
could call the receiver in a thread.
:param kwargs: Extra keyword arguments to pass to each receiver.
.. versionadded:: 1.7
"""
if self.is_muted:
return []
results = []
for receiver in self.receivers_for(sender):
if not iscoroutinefunction(receiver):
if _sync_wrapper is None:
raise RuntimeError("Cannot send to a non-coroutine function.")
result = await _sync_wrapper(receiver)(sender, **kwargs)
else:
result = await receiver(sender, **kwargs)
results.append((receiver, result))
return results
def has_receivers_for(self, sender: t.Any) -> bool:
"""Check if there is at least one receiver that will be called with the
given ``sender``. A receiver connected to :data:`ANY` will always be
called, regardless of sender. Does not check if weakly referenced
receivers are still live. See :meth:`receivers_for` for a stronger
search.
:param sender: Check for receivers connected to this sender, in addition
to those connected to :data:`ANY`.
"""
if not self.receivers:
return False
if self._by_sender[ANY_ID]:
return True
if sender is ANY:
return False
return make_id(sender) in self._by_sender
def receivers_for(
self, sender: t.Any
) -> c.Generator[c.Callable[..., t.Any], None, None]:
"""Yield each receiver to be called for ``sender``, in addition to those
to be called for :data:`ANY`. Weakly referenced receivers that are not
live will be disconnected and skipped.
:param sender: Yield receivers connected to this sender, in addition
to those connected to :data:`ANY`.
"""
# TODO: test receivers_for(ANY)
if not self.receivers:
return
sender_id = make_id(sender)
if sender_id in self._by_sender:
ids = self._by_sender[ANY_ID] | self._by_sender[sender_id]
else:
ids = self._by_sender[ANY_ID].copy()
for receiver_id in ids:
receiver = self.receivers.get(receiver_id)
if receiver is None:
continue
if isinstance(receiver, weakref.ref):
strong = receiver()
if strong is None:
self._disconnect(receiver_id, ANY_ID)
continue
yield strong
else:
yield receiver
def disconnect(self, receiver: c.Callable[..., t.Any], sender: t.Any = ANY) -> None:
"""Disconnect ``receiver`` from being called when the signal is sent by
``sender``.
:param receiver: A connected receiver callable.
:param sender: Disconnect from only this sender. By default, disconnect
from all senders.
"""
sender_id: c.Hashable
if sender is ANY:
sender_id = ANY_ID
else:
sender_id = make_id(sender)
receiver_id = make_id(receiver)
self._disconnect(receiver_id, sender_id)
if (
"receiver_disconnected" in self.__dict__
and self.receiver_disconnected.receivers
):
self.receiver_disconnected.send(self, receiver=receiver, sender=sender)
def _disconnect(self, receiver_id: c.Hashable, sender_id: c.Hashable) -> None:
if sender_id == ANY_ID:
if self._by_receiver.pop(receiver_id, None) is not None:
for bucket in self._by_sender.values():
bucket.discard(receiver_id)
self.receivers.pop(receiver_id, None)
else:
self._by_sender[sender_id].discard(receiver_id)
self._by_receiver[receiver_id].discard(sender_id)
def _make_cleanup_receiver(
self, receiver_id: c.Hashable
) -> c.Callable[[weakref.ref[c.Callable[..., t.Any]]], None]:
"""Create a callback function to disconnect a weakly referenced
receiver when it is garbage collected.
"""
def cleanup(ref: weakref.ref[c.Callable[..., t.Any]]) -> None:
# If the interpreter is shutting down, disconnecting can result in a
# weird ignored exception. Don't call it in that case.
if not sys.is_finalizing():
self._disconnect(receiver_id, ANY_ID)
return cleanup
def _make_cleanup_sender(
self, sender_id: c.Hashable
) -> c.Callable[[weakref.ref[t.Any]], None]:
"""Create a callback function to disconnect all receivers for a weakly
referenced sender when it is garbage collected.
"""
assert sender_id != ANY_ID
def cleanup(ref: weakref.ref[t.Any]) -> None:
self._weak_senders.pop(sender_id, None)
for receiver_id in self._by_sender.pop(sender_id, ()):
self._by_receiver[receiver_id].discard(sender_id)
return cleanup
def _cleanup_bookkeeping(self) -> None:
"""Prune unused sender/receiver bookkeeping. Not threadsafe.
Connecting & disconnecting leaves behind a small amount of bookkeeping
data. Typical workloads using Blinker, for example in most web apps,
Flask, CLI scripts, etc., are not adversely affected by this
bookkeeping.
With a long-running process performing dynamic signal routing with high
volume, e.g. connecting to function closures, senders are all unique
object instances. Doing all of this over and over may cause memory usage
to grow due to extraneous bookkeeping. (An empty ``set`` for each stale
sender/receiver pair.)
This method will prune that bookkeeping away, with the caveat that such
pruning is not threadsafe. The risk is that cleanup of a fully
disconnected receiver/sender pair occurs while another thread is
connecting that same pair. If you are in the highly dynamic, unique
receiver/sender situation that has lead you to this method, that failure
mode is perhaps not a big deal for you.
"""
for mapping in (self._by_sender, self._by_receiver):
for ident, bucket in list(mapping.items()):
if not bucket:
mapping.pop(ident, None)
def _clear_state(self) -> None:
"""Disconnect all receivers and senders. Useful for tests."""
self._weak_senders.clear()
self.receivers.clear()
self._by_sender.clear()
self._by_receiver.clear()
class NamedSignal(Signal):
"""A named generic notification emitter. The name is not used by the signal
itself, but matches the key in the :class:`Namespace` that it belongs to.
:param name: The name of the signal within the namespace.
:param doc: The docstring for the signal.
"""
def __init__(self, name: str, doc: str | None = None) -> None:
super().__init__(doc)
#: The name of this signal.
self.name: str = name
def __repr__(self) -> str:
base = super().__repr__()
return f"{base[:-1]}; {self.name!r}>" # noqa: E702
class Namespace(dict[str, NamedSignal]):
"""A dict mapping names to signals."""
def signal(self, name: str, doc: str | None = None) -> NamedSignal:
"""Return the :class:`NamedSignal` for the given ``name``, creating it
if required. Repeated calls with the same name return the same signal.
:param name: The name of the signal.
:param doc: The docstring of the signal.
"""
if name not in self:
self[name] = NamedSignal(name, doc)
return self[name]
class _PNamespaceSignal(t.Protocol):
def __call__(self, name: str, doc: str | None = None) -> NamedSignal: ...
default_namespace: Namespace = Namespace()
"""A default :class:`Namespace` for creating named signals. :func:`signal`
creates a :class:`NamedSignal` in this namespace.
"""
signal: _PNamespaceSignal = default_namespace.signal
"""Return a :class:`NamedSignal` in :data:`default_namespace` with the given
``name``, creating it if required. Repeated calls with the same name return the
same signal.
"""

View File

@ -0,0 +1,20 @@
This package contains a modified version of ca-bundle.crt:
ca-bundle.crt -- Bundle of CA Root Certificates
This is a bundle of X.509 certificates of public Certificate Authorities
(CA). These were automatically extracted from Mozilla's root certificates
file (certdata.txt). This file can be found in the mozilla source tree:
https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
It contains the certificates in PEM format and therefore
can be directly used with curl / libcurl / php_curl, or with
an Apache+mod_ssl webserver for SSL client authentication.
Just configure this file as the SSLCACertificateFile.#
***** BEGIN LICENSE BLOCK *****
This Source Code Form is subject to the terms of the Mozilla Public License,
v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
one at http://mozilla.org/MPL/2.0/.
***** END LICENSE BLOCK *****
@(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $

View File

@ -0,0 +1,68 @@
Metadata-Version: 2.1
Name: certifi
Version: 2025.1.31
Summary: Python package for providing Mozilla's CA Bundle.
Home-page: https://github.com/certifi/python-certifi
Author: Kenneth Reitz
Author-email: me@kennethreitz.com
License: MPL-2.0
Project-URL: Source, https://github.com/certifi/python-certifi
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
Classifier: Natural Language :: English
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Requires-Python: >=3.6
License-File: LICENSE
Certifi: Python SSL Certificates
================================
Certifi provides Mozilla's carefully curated collection of Root Certificates for
validating the trustworthiness of SSL certificates while verifying the identity
of TLS hosts. It has been extracted from the `Requests`_ project.
Installation
------------
``certifi`` is available on PyPI. Simply install it with ``pip``::
$ pip install certifi
Usage
-----
To reference the installed certificate authority (CA) bundle, you can use the
built-in function::
>>> import certifi
>>> certifi.where()
'/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
Or from the command line::
$ python -m certifi
/usr/local/lib/python3.7/site-packages/certifi/cacert.pem
Enjoy!
.. _`Requests`: https://requests.readthedocs.io/en/master/
Addition/Removal of Certificates
--------------------------------
Certifi does not support any addition/removal or other modification of the
CA trust store content. This project is intended to provide a reliable and
highly portable root of trust to python deployments. Look to upstream projects
for methods to use alternate trust.

View File

@ -0,0 +1,14 @@
certifi-2025.1.31.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
certifi-2025.1.31.dist-info/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
certifi-2025.1.31.dist-info/METADATA,sha256=l9pPyH8X-gORo4Wgl72vZd2uifJ_kLcnDwKakddIWPM,2273
certifi-2025.1.31.dist-info/RECORD,,
certifi-2025.1.31.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
certifi-2025.1.31.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
certifi/__init__.py,sha256=neIaAf7BM36ygmQCmy-ZsSyjnvjWghFeu13wwEAnjj0,94
certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
certifi/__pycache__/__init__.cpython-311.pyc,,
certifi/__pycache__/__main__.cpython-311.pyc,,
certifi/__pycache__/core.cpython-311.pyc,,
certifi/cacert.pem,sha256=xVsh-Qf3-G1IrdCTVS-1ZRdJ_1-GBQjMu0I9bB-9gMc,297255
certifi/core.py,sha256=qRDDFyXVJwTB_EmoGppaXU_R9qCZvhl-EzxPMuV3nTA,4426
certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0

View File

@ -0,0 +1,5 @@
Wheel-Version: 1.0
Generator: setuptools (75.3.0)
Root-Is-Purelib: true
Tag: py3-none-any

View File

@ -0,0 +1,4 @@
from .core import contents, where
__all__ = ["contents", "where"]
__version__ = "2025.01.31"

View File

@ -0,0 +1,12 @@
import argparse
from certifi import contents, where
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--contents", action="store_true")
args = parser.parse_args()
if args.contents:
print(contents())
else:
print(where())

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,114 @@
"""
certifi.py
~~~~~~~~~~
This module returns the installation location of cacert.pem or its contents.
"""
import sys
import atexit
def exit_cacert_ctx() -> None:
_CACERT_CTX.__exit__(None, None, None) # type: ignore[union-attr]
if sys.version_info >= (3, 11):
from importlib.resources import as_file, files
_CACERT_CTX = None
_CACERT_PATH = None
def where() -> str:
# This is slightly terrible, but we want to delay extracting the file
# in cases where we're inside of a zipimport situation until someone
# actually calls where(), but we don't want to re-extract the file
# on every call of where(), so we'll do it once then store it in a
# global variable.
global _CACERT_CTX
global _CACERT_PATH
if _CACERT_PATH is None:
# This is slightly janky, the importlib.resources API wants you to
# manage the cleanup of this file, so it doesn't actually return a
# path, it returns a context manager that will give you the path
# when you enter it and will do any cleanup when you leave it. In
# the common case of not needing a temporary file, it will just
# return the file system location and the __exit__() is a no-op.
#
# We also have to hold onto the actual context manager, because
# it will do the cleanup whenever it gets garbage collected, so
# we will also store that at the global level as well.
_CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
_CACERT_PATH = str(_CACERT_CTX.__enter__())
atexit.register(exit_cacert_ctx)
return _CACERT_PATH
def contents() -> str:
return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
elif sys.version_info >= (3, 7):
from importlib.resources import path as get_path, read_text
_CACERT_CTX = None
_CACERT_PATH = None
def where() -> str:
# This is slightly terrible, but we want to delay extracting the
# file in cases where we're inside of a zipimport situation until
# someone actually calls where(), but we don't want to re-extract
# the file on every call of where(), so we'll do it once then store
# it in a global variable.
global _CACERT_CTX
global _CACERT_PATH
if _CACERT_PATH is None:
# This is slightly janky, the importlib.resources API wants you
# to manage the cleanup of this file, so it doesn't actually
# return a path, it returns a context manager that will give
# you the path when you enter it and will do any cleanup when
# you leave it. In the common case of not needing a temporary
# file, it will just return the file system location and the
# __exit__() is a no-op.
#
# We also have to hold onto the actual context manager, because
# it will do the cleanup whenever it gets garbage collected, so
# we will also store that at the global level as well.
_CACERT_CTX = get_path("certifi", "cacert.pem")
_CACERT_PATH = str(_CACERT_CTX.__enter__())
atexit.register(exit_cacert_ctx)
return _CACERT_PATH
def contents() -> str:
return read_text("certifi", "cacert.pem", encoding="ascii")
else:
import os
import types
from typing import Union
Package = Union[types.ModuleType, str]
Resource = Union[str, "os.PathLike"]
# This fallback will work for Python versions prior to 3.7 that lack the
# importlib.resources module but relies on the existing `where` function
# so won't address issues with environments like PyOxidizer that don't set
# __file__ on modules.
def read_text(
package: Package,
resource: Resource,
encoding: str = 'utf-8',
errors: str = 'strict'
) -> str:
with open(where(), encoding=encoding) as data:
return data.read()
# If we don't have importlib.resources, then we will just do the old logic
# of assuming we're on the filesystem and munge the path directly.
def where() -> str:
f = os.path.dirname(__file__)
return os.path.join(f, "cacert.pem")
def contents() -> str:
return read_text("certifi", "cacert.pem", encoding="ascii")

View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2025 TAHRI Ahmed R.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,721 @@
Metadata-Version: 2.1
Name: charset-normalizer
Version: 3.4.1
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
Author-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
Maintainer-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
License: MIT
Project-URL: Changelog, https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/
Project-URL: Code, https://github.com/jawah/charset_normalizer
Project-URL: Issue tracker, https://github.com/jawah/charset_normalizer/issues
Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: Implementation :: CPython
Classifier: Programming Language :: Python :: Implementation :: PyPy
Classifier: Topic :: Text Processing :: Linguistic
Classifier: Topic :: Utilities
Classifier: Typing :: Typed
Requires-Python: >=3.7
Description-Content-Type: text/markdown
License-File: LICENSE
Provides-Extra: unicode-backport
<h1 align="center">Charset Detection, for Everyone 👋</h1>
<p align="center">
<sup>The Real First Universal Charset Detector</sup><br>
<a href="https://pypi.org/project/charset-normalizer">
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
</a>
<a href="https://pepy.tech/project/charset-normalizer/">
<img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
</a>
<a href="https://bestpractices.coreinfrastructure.org/projects/7297">
<img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
</a>
</p>
<p align="center">
<sup><i>Featured Packages</i></sup><br>
<a href="https://github.com/jawah/niquests">
<img alt="Static Badge" src="https://img.shields.io/badge/Niquests-Best_HTTP_Client-cyan">
</a>
<a href="https://github.com/jawah/wassima">
<img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Killer-cyan">
</a>
</p>
<p align="center">
<sup><i>In other language (unofficial port - by the community)</i></sup><br>
<a href="https://github.com/nickspring/charset-normalizer-rs">
<img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
</a>
</p>
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
> I'm trying to resolve the issue by taking a new approach.
> All IANA character set names for which the Python core library provides codecs are supported.
<p align="center">
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
</p>
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
| `Fast` | ❌ | ✅ | ✅ |
| `Universal**` | ❌ | ✅ | ❌ |
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
| `Native Python` | ✅ | ✅ | ❌ |
| `Detect spoken language` | ❌ | ✅ | N/A |
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
| `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
| `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
<p align="center">
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
</p>
*\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*<br>
## ⚡ Performance
This package offer better performance than its counterpart Chardet. Here are some numbers.
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|-----------------------------------------------|:--------:|:------------------:|:------------------:|
| [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec |
| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
| Package | 99th percentile | 95th percentile | 50th percentile |
|-----------------------------------------------|:---------------:|:---------------:|:---------------:|
| [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms |
| charset-normalizer | 100 ms | 50 ms | 5 ms |
_updated as of december 2024 using CPython 3.12_
Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
> Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
> And yes, these results might change at any time. The dataset can be updated to include more files.
> The actual delays heavily depends on your CPU capabilities. The factors should remain the same.
> Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
> (e.g. Supported Encoding) Challenge-them if you want.
## ✨ Installation
Using pip:
```sh
pip install charset-normalizer -U
```
## 🚀 Basic Usage
### CLI
This package comes with a CLI.
```
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
file [file ...]
The Real First Universal Charset Detector. Discover originating encoding used
on text file. Normalize text to unicode.
positional arguments:
files File(s) to be analysed
optional arguments:
-h, --help show this help message and exit
-v, --verbose Display complementary information about file if any.
Stdout will contain logs about the detection process.
-a, --with-alternative
Output complementary possibilities if any. Top-level
JSON WILL be a list.
-n, --normalize Permit to normalize input file. If not set, program
does not write anything.
-m, --minimal Only output the charset detected to STDOUT. Disabling
JSON output.
-r, --replace Replace file when trying to normalize it instead of
creating a new one.
-f, --force Replace file without asking if you are sure, use this
flag with caution.
-t THRESHOLD, --threshold THRESHOLD
Define a custom maximum amount of chaos allowed in
decoded content. 0. <= chaos <= 1.
--version Show version information and exit.
```
```bash
normalizer ./data/sample.1.fr.srt
```
or
```bash
python -m charset_normalizer ./data/sample.1.fr.srt
```
🎉 Since version 1.4.0 the CLI produce easily usable stdout result in JSON format.
```json
{
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
"encoding": "cp1252",
"encoding_aliases": [
"1252",
"windows_1252"
],
"alternative_encodings": [
"cp1254",
"cp1256",
"cp1258",
"iso8859_14",
"iso8859_15",
"iso8859_16",
"iso8859_3",
"iso8859_9",
"latin_1",
"mbcs"
],
"language": "French",
"alphabets": [
"Basic Latin",
"Latin-1 Supplement"
],
"has_sig_or_bom": false,
"chaos": 0.149,
"coherence": 97.152,
"unicode_path": null,
"is_preferred": true
}
```
### Python
*Just print out normalized text*
```python
from charset_normalizer import from_path
results = from_path('./my_subtitle.srt')
print(str(results.best()))
```
*Upgrade your code without effort*
```python
from charset_normalizer import detect
```
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
## 😇 Why
When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
reliable alternative using a completely different method. Also! I never back down on a good challenge!
I **don't care** about the **originating charset** encoding, because **two different tables** can
produce **two identical rendered string.**
What I want is to get readable text, the best I can.
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is to repair Unicode string whereas charset-normalizer to convert raw file in unknown encoding to unicode.
## 🍰 How
- Discard all charset encoding table that could not fit the binary content.
- Measure noise, or the mess once opened (by chunks) with a corresponding charset encoding.
- Extract matches with the lowest mess detected.
- Additionally, we measure coherence / probe for a language.
**Wait a minute**, what is noise/mess and coherence according to **YOU ?**
*Noise :* I opened hundred of text files, **written by humans**, with the wrong encoding table. **I observed**, then
**I established** some ground rules about **what is obvious** when **it seems like** a mess (aka. defining noise in rendered text).
I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
improve or rewrite it.
*Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
## ⚡ Known limitations
- Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
- Every charset detector heavily depends on sufficient content. In common cases, do not bother run detection on very tiny content.
## ⚠️ About Python EOLs
**If you are running:**
- Python >=2.7,<3.5: Unsupported
- Python 3.5: charset-normalizer < 2.1
- Python 3.6: charset-normalizer < 3.1
- Python 3.7: charset-normalizer < 4.0
Upgrade your Python interpreter as soon as possible.
## 👤 Contributing
Contributions, issues and feature requests are very much welcome.<br />
Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
## 📝 License
Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
## 💼 For Enterprise
Professional support for charset-normalizer is available as part of the [Tidelift
Subscription][1]. Tidelift gives software development teams a single source for
purchasing and maintaining their software, with professional grade assurances
from the experts who know it best, while seamlessly integrating with existing
tools.
[1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/7297/badge)](https://www.bestpractices.dev/projects/7297)
# Changelog
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
### Changed
- Project metadata are now stored using `pyproject.toml` instead of `setup.cfg` using setuptools as the build backend.
- Enforce annotation delayed loading for a simpler and consistent types in the project.
- Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
### Added
- pre-commit configuration.
- noxfile.
### Removed
- `build-requirements.txt` as per using `pyproject.toml` native build configuration.
- `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
- `setup.cfg` in favor of `pyproject.toml` metadata configuration.
- Unused `utils.range_scan` function.
### Fixed
- Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
- Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
### Added
- Argument `--no-preemptive` in the CLI to prevent the detector to search for hints.
- Support for Python 3.13 (#512)
### Fixed
- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
- Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) (#537)
- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
### Fixed
- Unintentional memory usage regression when using large payload that match several encoding (#376)
- Regression on some detection case showcased in the documentation (#371)
### Added
- Noise (md) probe that identify malformed arabic representation due to the presence of letters in isolated form (credit to my wife)
## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
### Changed
- Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
- Improved the general detection reliability based on reports from the community
## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
### Added
- Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
- Support for 9 forgotten encoding that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
### Removed
- (internal) Redundant utils.is_ascii function and unused function is_private_use_only
- (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
### Changed
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
### Fixed
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
### Changed
- Typehint for function `from_path` no longer enforce `PathLike` as its first argument
- Minor improvement over the global detection reliability
### Added
- Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
- Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
- Explicit support for Python 3.12
### Fixed
- Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
### Added
- Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
### Removed
- Support for Python 3.6 (PR #260)
### Changed
- Optional speedup provided by mypy/c 1.0.1
## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
### Fixed
- Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
### Changed
- Speedup provided by mypy/c 0.990 on Python >= 3.7
## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
### Added
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
### Changed
- Build with static metadata using 'build' frontend
- Make the language detection stricter
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
### Fixed
- CLI with opt --normalize fail when using full path for files
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
- Sphinx warnings when generating the documentation
### Removed
- Coherence detector no longer return 'Simple English' instead return 'English'
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
- Breaking: Method `first()` and `best()` from CharsetMatch
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
- Breaking: Top-level function `normalize`
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
- Support for the backport `unicodedata2`
## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
### Added
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
### Changed
- Build with static metadata using 'build' frontend
- Make the language detection stricter
### Fixed
- CLI with opt --normalize fail when using full path for files
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
### Removed
- Coherence detector no longer return 'Simple English' instead return 'English'
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
### Added
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
### Removed
- Breaking: Method `first()` and `best()` from CharsetMatch
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
### Fixed
- Sphinx warnings when generating the documentation
## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
### Changed
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
### Removed
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
- Breaking: Top-level function `normalize`
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
- Support for the backport `unicodedata2`
## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
### Deprecated
- Function `normalize` scheduled for removal in 3.0
### Changed
- Removed useless call to decode in fn is_unprintable (#206)
### Fixed
- Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
### Added
- Output the Unicode table version when running the CLI with `--version` (PR #194)
### Changed
- Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
- Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
### Fixed
- Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
- CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
### Removed
- Support for Python 3.5 (PR #192)
### Deprecated
- Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
### Fixed
- ASCII miss-detection on rare cases (PR #170)
## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
### Added
- Explicit support for Python 3.11 (PR #164)
### Changed
- The logging behavior have been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
### Fixed
- Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
### Changed
- Skipping the language-detection (CD) on ASCII (PR #155)
## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
### Changed
- Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
### Fixed
- Wrong logging level applied when setting kwarg `explain` to True (PR #146)
## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
### Changed
- Improvement over Vietnamese detection (PR #126)
- MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
- Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
- call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
- Code style as refactored by Sourcery-AI (PR #131)
- Minor adjustment on the MD around european words (PR #133)
- Remove and replace SRTs from assets / tests (PR #139)
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
- Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
### Fixed
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
- Avoid using too insignificant chunk (PR #137)
### Added
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
- Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
### Added
- Add support for Kazakh (Cyrillic) language detection (PR #109)
### Changed
- Further, improve inferring the language from a given single-byte code page (PR #112)
- Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
- Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
- Various detection improvement (MD+CD) (PR #117)
### Removed
- Remove redundant logging entry about detected language(s) (PR #115)
### Fixed
- Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
### Fixed
- Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
- Fix CLI crash when using --minimal output in certain cases (PR #103)
### Changed
- Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
### Changed
- The project now comply with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
- The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
- The Unicode detection is slightly improved (PR #93)
- Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
### Removed
- The project no longer raise warning on tiny content given for detection, will be simply logged as warning instead (PR #92)
### Fixed
- In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
- Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
- The MANIFEST.in was not exhaustive (PR #78)
## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
### Fixed
- The CLI no longer raise an unexpected exception when no encoding has been found (PR #70)
- Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
- The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
- Submatch factoring could be wrong in rare edge cases (PR #72)
- Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
- Fix line endings from CRLF to LF for certain project files (PR #67)
### Changed
- Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
- Allow fallback on specified encoding if any (PR #71)
## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
### Changed
- Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
- According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
### Fixed
- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
### Changed
- Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
### Fixed
- Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
- Using explain=False permanently disable the verbose output in the current runtime (PR #47)
- One log entry (language target preemptive) was not show in logs when using explain=True (PR #47)
- Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
### Changed
- Public function normalize default args values were not aligned with from_bytes (PR #53)
### Added
- You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
### Changed
- 4x to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
- Accent has been made on UTF-8 detection, should perform rather instantaneous.
- The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
- The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
- The program has been rewritten to ease the readability and maintainability. (+Using static typing)+
- utf_7 detection has been reinstated.
### Removed
- This package no longer require anything when used with Python 3.5 (Dropped cached_property)
- Removed support for these languages: Catalan, Esperanto, Kazakh, Baque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
- The exception hook on UnicodeDecodeError has been removed.
### Deprecated
- Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
### Fixed
- The CLI output used the relative path of the file(s). Should be absolute.
## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
### Fixed
- Logger configuration/usage no longer conflict with others (PR #44)
## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
### Removed
- Using standard logging instead of using the package loguru.
- Dropping nose test framework in favor of the maintained pytest.
- Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
- Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
- Stop support for UTF-7 that does not contain a SIG.
- Dropping PrettyTable, replaced with pure JSON output in CLI.
### Fixed
- BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
- Not searching properly for the BOM when trying utf32/16 parent codec.
### Changed
- Improving the package final size by compressing frequencies.json.
- Huge improvement over the larges payload.
### Added
- CLI now produces JSON consumable output.
- Return ASCII if given sequences fit. Given reasonable confidence.
## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
### Fixed
- In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
### Fixed
- Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
### Fixed
- The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
### Changed
- Amend the previous release to allow prettytable 2.0 (PR #35)
## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
### Fixed
- Fix error while using the package with a python pre-release interpreter (PR #33)
### Changed
- Dependencies refactoring, constraints revised.
### Added
- Add python 3.9 and 3.10 to the supported interpreters
MIT License
Copyright (c) 2025 TAHRI Ahmed R.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -0,0 +1,35 @@
../../../bin/normalizer,sha256=lVuCYZlIlimMhsrs9VuhY42OkZOAg4VZbVVZ8-cP38M,257
charset_normalizer-3.4.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
charset_normalizer-3.4.1.dist-info/LICENSE,sha256=bQ1Bv-FwrGx9wkjJpj4lTQ-0WmDVCoJX0K-SxuJJuIc,1071
charset_normalizer-3.4.1.dist-info/METADATA,sha256=JbyHzhmqZh_ugEn1Y7TY7CDYZA9FoU6BP25hrCNDf50,35313
charset_normalizer-3.4.1.dist-info/RECORD,,
charset_normalizer-3.4.1.dist-info/WHEEL,sha256=ZiHiI0fxbnsGhDML32hrhH3YKU2c-6yRirdNq7QKO5A,153
charset_normalizer-3.4.1.dist-info/entry_points.txt,sha256=8C-Y3iXIfyXQ83Tpir2B8t-XLJYpxF5xbb38d_js-h4,65
charset_normalizer-3.4.1.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
charset_normalizer/__init__.py,sha256=OKRxRv2Zhnqk00tqkN0c1BtJjm165fWXLydE52IKuHc,1590
charset_normalizer/__main__.py,sha256=yzYxMR-IhKRHYwcSlavEv8oGdwxsR89mr2X09qXGdps,109
charset_normalizer/__pycache__/__init__.cpython-311.pyc,,
charset_normalizer/__pycache__/__main__.cpython-311.pyc,,
charset_normalizer/__pycache__/api.cpython-311.pyc,,
charset_normalizer/__pycache__/cd.cpython-311.pyc,,
charset_normalizer/__pycache__/constant.cpython-311.pyc,,
charset_normalizer/__pycache__/legacy.cpython-311.pyc,,
charset_normalizer/__pycache__/md.cpython-311.pyc,,
charset_normalizer/__pycache__/models.cpython-311.pyc,,
charset_normalizer/__pycache__/utils.cpython-311.pyc,,
charset_normalizer/__pycache__/version.cpython-311.pyc,,
charset_normalizer/api.py,sha256=qBRz8mJ_R5E713R6TOyqHEdnmyxbEDnCSHvx32ubDGg,22617
charset_normalizer/cd.py,sha256=WKTo1HDb-H9HfCDc3Bfwq5jzS25Ziy9SE2a74SgTq88,12522
charset_normalizer/cli/__init__.py,sha256=D8I86lFk2-py45JvqxniTirSj_sFyE6sjaY_0-G1shc,136
charset_normalizer/cli/__main__.py,sha256=VGC9klOoi6_R2z8rmyrc936kv7u2A1udjjHtlmNPDTM,10410
charset_normalizer/cli/__pycache__/__init__.cpython-311.pyc,,
charset_normalizer/cli/__pycache__/__main__.cpython-311.pyc,,
charset_normalizer/constant.py,sha256=4VuTcZNLew1j_8ixA-Rt_VVqNWD4pwgHOHMCMlr0964,40477
charset_normalizer/legacy.py,sha256=yhNXsPHkBfqPXKRb-sPXNj3Bscp9-mFGcYOkJ62tg9c,2328
charset_normalizer/md.cpython-311-aarch64-linux-gnu.so,sha256=5hU4sWeM3XiR2vLRWyKxBqAMGpadzRWcsPw0feJSHM0,69800
charset_normalizer/md.py,sha256=iyXXQGWl54nnLQLueMWTmUtlivO0-rTBgVkmJxIIAGU,20036
charset_normalizer/md__mypyc.cpython-311-aarch64-linux-gnu.so,sha256=D65ppZ_SHkizbNWYKkGWFif-mnRWW5qQbIiYgS0MFcU,321840
charset_normalizer/models.py,sha256=lKXhOnIPtiakbK3i__J9wpOfzx3JDTKj7Dn3Rg0VaRI,12394
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
charset_normalizer/utils.py,sha256=T5UHo8AS7NVMmgruWoZyqEf0WrZVcQpgUNetRoborSk,12002
charset_normalizer/version.py,sha256=Ambcj3O8FfvdLfDLc8dkaxZx97O1IM_R4_aKGD_TDdE,115

View File

@ -0,0 +1,6 @@
Wheel-Version: 1.0
Generator: setuptools (75.6.0)
Root-Is-Purelib: false
Tag: cp311-cp311-manylinux_2_17_aarch64
Tag: cp311-cp311-manylinux2014_aarch64

View File

@ -0,0 +1,2 @@
[console_scripts]
normalizer = charset_normalizer:cli.cli_detect

View File

@ -0,0 +1 @@
charset_normalizer

View File

@ -0,0 +1,48 @@
"""
Charset-Normalizer
~~~~~~~~~~~~~~
The Real First Universal Charset Detector.
A library that helps you read text from an unknown charset encoding.
Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
All IANA character set names for which the Python core library provides codecs are supported.
Basic usage:
>>> from charset_normalizer import from_bytes
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
>>> best_guess = results.best()
>>> str(best_guess)
'Bсеки човек има право на образование. Oбразованието!'
Others methods and usages are available - see the full documentation
at <https://github.com/Ousret/charset_normalizer>.
:copyright: (c) 2021 by Ahmed TAHRI
:license: MIT, see LICENSE for more details.
"""
from __future__ import annotations
import logging
from .api import from_bytes, from_fp, from_path, is_binary
from .legacy import detect
from .models import CharsetMatch, CharsetMatches
from .utils import set_logging_handler
from .version import VERSION, __version__
__all__ = (
"from_fp",
"from_path",
"from_bytes",
"is_binary",
"detect",
"CharsetMatch",
"CharsetMatches",
"__version__",
"VERSION",
"set_logging_handler",
)
# Attach a NullHandler to the top level logger by default
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())

View File

@ -0,0 +1,6 @@
from __future__ import annotations
from .cli import cli_detect
if __name__ == "__main__":
cli_detect()

View File

@ -0,0 +1,668 @@
from __future__ import annotations
import logging
from os import PathLike
from typing import BinaryIO
from .cd import (
coherence_ratio,
encoding_languages,
mb_encoding_languages,
merge_coherence_ratios,
)
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
from .md import mess_ratio
from .models import CharsetMatch, CharsetMatches
from .utils import (
any_specified_encoding,
cut_sequence_chunks,
iana_name,
identify_sig_or_bom,
is_cp_similar,
is_multi_byte_encoding,
should_strip_sig_or_bom,
)
logger = logging.getLogger("charset_normalizer")
explain_handler = logging.StreamHandler()
explain_handler.setFormatter(
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
)
def from_bytes(
sequences: bytes | bytearray,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.2,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
If there is no results, it is a strong indicator that the source is binary/not text.
By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.
The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
but never take it for granted. Can improve the performance.
You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
purpose.
This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
Custom logging format and handler can be set manually.
"""
if not isinstance(sequences, (bytearray, bytes)):
raise TypeError(
"Expected object of type bytes or bytearray, got: {}".format(
type(sequences)
)
)
if explain:
previous_logger_level: int = logger.level
logger.addHandler(explain_handler)
logger.setLevel(TRACE)
length: int = len(sequences)
if length == 0:
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level or logging.WARNING)
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
if cp_isolation is not None:
logger.log(
TRACE,
"cp_isolation is set. use this flag for debugging purpose. "
"limited list of encoding allowed : %s.",
", ".join(cp_isolation),
)
cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
else:
cp_isolation = []
if cp_exclusion is not None:
logger.log(
TRACE,
"cp_exclusion is set. use this flag for debugging purpose. "
"limited list of encoding excluded : %s.",
", ".join(cp_exclusion),
)
cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
else:
cp_exclusion = []
if length <= (chunk_size * steps):
logger.log(
TRACE,
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
steps,
chunk_size,
length,
)
steps = 1
chunk_size = length
if steps > 1 and length / steps < chunk_size:
chunk_size = int(length / steps)
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
if is_too_small_sequence:
logger.log(
TRACE,
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
length
),
)
elif is_too_large_sequence:
logger.log(
TRACE,
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
length
),
)
prioritized_encodings: list[str] = []
specified_encoding: str | None = (
any_specified_encoding(sequences) if preemptive_behaviour else None
)
if specified_encoding is not None:
prioritized_encodings.append(specified_encoding)
logger.log(
TRACE,
"Detected declarative mark in sequence. Priority +1 given for %s.",
specified_encoding,
)
tested: set[str] = set()
tested_but_hard_failure: list[str] = []
tested_but_soft_failure: list[str] = []
fallback_ascii: CharsetMatch | None = None
fallback_u8: CharsetMatch | None = None
fallback_specified: CharsetMatch | None = None
results: CharsetMatches = CharsetMatches()
early_stop_results: CharsetMatches = CharsetMatches()
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
if sig_encoding is not None:
prioritized_encodings.append(sig_encoding)
logger.log(
TRACE,
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
len(sig_payload),
sig_encoding,
)
prioritized_encodings.append("ascii")
if "utf_8" not in prioritized_encodings:
prioritized_encodings.append("utf_8")
for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
if cp_isolation and encoding_iana not in cp_isolation:
continue
if cp_exclusion and encoding_iana in cp_exclusion:
continue
if encoding_iana in tested:
continue
tested.add(encoding_iana)
decoded_payload: str | None = None
bom_or_sig_available: bool = sig_encoding == encoding_iana
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
encoding_iana
)
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
logger.log(
TRACE,
"Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
encoding_iana,
)
continue
if encoding_iana in {"utf_7"} and not bom_or_sig_available:
logger.log(
TRACE,
"Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
encoding_iana,
)
continue
try:
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
except (ModuleNotFoundError, ImportError):
logger.log(
TRACE,
"Encoding %s does not provide an IncrementalDecoder",
encoding_iana,
)
continue
try:
if is_too_large_sequence and is_multi_byte_decoder is False:
str(
(
sequences[: int(50e4)]
if strip_sig_or_bom is False
else sequences[len(sig_payload) : int(50e4)]
),
encoding=encoding_iana,
)
else:
decoded_payload = str(
(
sequences
if strip_sig_or_bom is False
else sequences[len(sig_payload) :]
),
encoding=encoding_iana,
)
except (UnicodeDecodeError, LookupError) as e:
if not isinstance(e, LookupError):
logger.log(
TRACE,
"Code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
tested_but_hard_failure.append(encoding_iana)
continue
similar_soft_failure_test: bool = False
for encoding_soft_failed in tested_but_soft_failure:
if is_cp_similar(encoding_iana, encoding_soft_failed):
similar_soft_failure_test = True
break
if similar_soft_failure_test:
logger.log(
TRACE,
"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
encoding_iana,
encoding_soft_failed,
)
continue
r_ = range(
0 if not bom_or_sig_available else len(sig_payload),
length,
int(length / steps),
)
multi_byte_bonus: bool = (
is_multi_byte_decoder
and decoded_payload is not None
and len(decoded_payload) < length
)
if multi_byte_bonus:
logger.log(
TRACE,
"Code page %s is a multi byte encoding table and it appear that at least one character "
"was encoded using n-bytes.",
encoding_iana,
)
max_chunk_gave_up: int = int(len(r_) / 4)
max_chunk_gave_up = max(max_chunk_gave_up, 2)
early_stop_count: int = 0
lazy_str_hard_failure = False
md_chunks: list[str] = []
md_ratios = []
try:
for chunk in cut_sequence_chunks(
sequences,
encoding_iana,
r_,
chunk_size,
bom_or_sig_available,
strip_sig_or_bom,
sig_payload,
is_multi_byte_decoder,
decoded_payload,
):
md_chunks.append(chunk)
md_ratios.append(
mess_ratio(
chunk,
threshold,
explain is True and 1 <= len(cp_isolation) <= 2,
)
)
if md_ratios[-1] >= threshold:
early_stop_count += 1
if (early_stop_count >= max_chunk_gave_up) or (
bom_or_sig_available and strip_sig_or_bom is False
):
break
except (
UnicodeDecodeError
) as e: # Lazy str loading may have missed something there
logger.log(
TRACE,
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
early_stop_count = max_chunk_gave_up
lazy_str_hard_failure = True
# We might want to check the sequence again with the whole content
# Only if initial MD tests passes
if (
not lazy_str_hard_failure
and is_too_large_sequence
and not is_multi_byte_decoder
):
try:
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
except UnicodeDecodeError as e:
logger.log(
TRACE,
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
encoding_iana,
str(e),
)
tested_but_hard_failure.append(encoding_iana)
continue
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
tested_but_soft_failure.append(encoding_iana)
logger.log(
TRACE,
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
"Computed mean chaos is %f %%.",
encoding_iana,
early_stop_count,
round(mean_mess_ratio * 100, ndigits=3),
)
# Preparing those fallbacks in case we got nothing.
if (
enable_fallback
and encoding_iana in ["ascii", "utf_8", specified_encoding]
and not lazy_str_hard_failure
):
fallback_entry = CharsetMatch(
sequences,
encoding_iana,
threshold,
False,
[],
decoded_payload,
preemptive_declaration=specified_encoding,
)
if encoding_iana == specified_encoding:
fallback_specified = fallback_entry
elif encoding_iana == "ascii":
fallback_ascii = fallback_entry
else:
fallback_u8 = fallback_entry
continue
logger.log(
TRACE,
"%s passed initial chaos probing. Mean measured chaos is %f %%",
encoding_iana,
round(mean_mess_ratio * 100, ndigits=3),
)
if not is_multi_byte_decoder:
target_languages: list[str] = encoding_languages(encoding_iana)
else:
target_languages = mb_encoding_languages(encoding_iana)
if target_languages:
logger.log(
TRACE,
"{} should target any language(s) of {}".format(
encoding_iana, str(target_languages)
),
)
cd_ratios = []
# We shall skip the CD when its about ASCII
# Most of the time its not relevant to run "language-detection" on it.
if encoding_iana != "ascii":
for chunk in md_chunks:
chunk_languages = coherence_ratio(
chunk,
language_threshold,
",".join(target_languages) if target_languages else None,
)
cd_ratios.append(chunk_languages)
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
if cd_ratios_merged:
logger.log(
TRACE,
"We detected language {} using {}".format(
cd_ratios_merged, encoding_iana
),
)
current_match = CharsetMatch(
sequences,
encoding_iana,
mean_mess_ratio,
bom_or_sig_available,
cd_ratios_merged,
(
decoded_payload
if (
is_too_large_sequence is False
or encoding_iana in [specified_encoding, "ascii", "utf_8"]
)
else None
),
preemptive_declaration=specified_encoding,
)
results.append(current_match)
if (
encoding_iana in [specified_encoding, "ascii", "utf_8"]
and mean_mess_ratio < 0.1
):
# If md says nothing to worry about, then... stop immediately!
if mean_mess_ratio == 0.0:
logger.debug(
"Encoding detection: %s is most likely the one.",
current_match.encoding,
)
if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([current_match])
early_stop_results.append(current_match)
if (
len(early_stop_results)
and (specified_encoding is None or specified_encoding in tested)
and "ascii" in tested
and "utf_8" in tested
):
probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment]
logger.debug(
"Encoding detection: %s is most likely the one.",
probable_result.encoding,
)
if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([probable_result])
if encoding_iana == sig_encoding:
logger.debug(
"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
"the beginning of the sequence.",
encoding_iana,
)
if explain: # Defensive: ensure exit path clean handler
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return CharsetMatches([results[encoding_iana]])
if len(results) == 0:
if fallback_u8 or fallback_ascii or fallback_specified:
logger.log(
TRACE,
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
)
if fallback_specified:
logger.debug(
"Encoding detection: %s will be used as a fallback match",
fallback_specified.encoding,
)
results.append(fallback_specified)
elif (
(fallback_u8 and fallback_ascii is None)
or (
fallback_u8
and fallback_ascii
and fallback_u8.fingerprint != fallback_ascii.fingerprint
)
or (fallback_u8 is not None)
):
logger.debug("Encoding detection: utf_8 will be used as a fallback match")
results.append(fallback_u8)
elif fallback_ascii:
logger.debug("Encoding detection: ascii will be used as a fallback match")
results.append(fallback_ascii)
if results:
logger.debug(
"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
results.best().encoding, # type: ignore
len(results) - 1,
)
else:
logger.debug("Encoding detection: Unable to determine any suitable charset.")
if explain:
logger.removeHandler(explain_handler)
logger.setLevel(previous_logger_level)
return results
def from_fp(
fp: BinaryIO,
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Same thing than the function from_bytes but using a file pointer that is already ready.
Will not close the file pointer.
"""
return from_bytes(
fp.read(),
steps,
chunk_size,
threshold,
cp_isolation,
cp_exclusion,
preemptive_behaviour,
explain,
language_threshold,
enable_fallback,
)
def from_path(
path: str | bytes | PathLike, # type: ignore[type-arg]
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = True,
) -> CharsetMatches:
"""
Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
Can raise IOError.
"""
with open(path, "rb") as fp:
return from_fp(
fp,
steps,
chunk_size,
threshold,
cp_isolation,
cp_exclusion,
preemptive_behaviour,
explain,
language_threshold,
enable_fallback,
)
def is_binary(
fp_or_path_or_payload: PathLike | str | BinaryIO | bytes, # type: ignore[type-arg]
steps: int = 5,
chunk_size: int = 512,
threshold: float = 0.20,
cp_isolation: list[str] | None = None,
cp_exclusion: list[str] | None = None,
preemptive_behaviour: bool = True,
explain: bool = False,
language_threshold: float = 0.1,
enable_fallback: bool = False,
) -> bool:
"""
Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string.
Based on the same main heuristic algorithms and default kwargs at the sole exception that fallbacks match
are disabled to be stricter around ASCII-compatible but unlikely to be a string.
"""
if isinstance(fp_or_path_or_payload, (str, PathLike)):
guesses = from_path(
fp_or_path_or_payload,
steps=steps,
chunk_size=chunk_size,
threshold=threshold,
cp_isolation=cp_isolation,
cp_exclusion=cp_exclusion,
preemptive_behaviour=preemptive_behaviour,
explain=explain,
language_threshold=language_threshold,
enable_fallback=enable_fallback,
)
elif isinstance(
fp_or_path_or_payload,
(
bytes,
bytearray,
),
):
guesses = from_bytes(
fp_or_path_or_payload,
steps=steps,
chunk_size=chunk_size,
threshold=threshold,
cp_isolation=cp_isolation,
cp_exclusion=cp_exclusion,
preemptive_behaviour=preemptive_behaviour,
explain=explain,
language_threshold=language_threshold,
enable_fallback=enable_fallback,
)
else:
guesses = from_fp(
fp_or_path_or_payload,
steps=steps,
chunk_size=chunk_size,
threshold=threshold,
cp_isolation=cp_isolation,
cp_exclusion=cp_exclusion,
preemptive_behaviour=preemptive_behaviour,
explain=explain,
language_threshold=language_threshold,
enable_fallback=enable_fallback,
)
return not guesses

View File

@ -0,0 +1,395 @@
from __future__ import annotations
import importlib
from codecs import IncrementalDecoder
from collections import Counter
from functools import lru_cache
from typing import Counter as TypeCounter
from .constant import (
FREQUENCIES,
KO_NAMES,
LANGUAGE_SUPPORTED_COUNT,
TOO_SMALL_SEQUENCE,
ZH_NAMES,
)
from .md import is_suspiciously_successive_range
from .models import CoherenceMatches
from .utils import (
is_accentuated,
is_latin,
is_multi_byte_encoding,
is_unicode_range_secondary,
unicode_range,
)
def encoding_unicode_range(iana_name: str) -> list[str]:
"""
Return associated unicode ranges in a single byte code page.
"""
if is_multi_byte_encoding(iana_name):
raise OSError("Function not supported on multi-byte code page")
decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
p: IncrementalDecoder = decoder(errors="ignore")
seen_ranges: dict[str, int] = {}
character_count: int = 0
for i in range(0x40, 0xFF):
chunk: str = p.decode(bytes([i]))
if chunk:
character_range: str | None = unicode_range(chunk)
if character_range is None:
continue
if is_unicode_range_secondary(character_range) is False:
if character_range not in seen_ranges:
seen_ranges[character_range] = 0
seen_ranges[character_range] += 1
character_count += 1
return sorted(
[
character_range
for character_range in seen_ranges
if seen_ranges[character_range] / character_count >= 0.15
]
)
def unicode_range_languages(primary_range: str) -> list[str]:
"""
Return inferred languages used with a unicode range.
"""
languages: list[str] = []
for language, characters in FREQUENCIES.items():
for character in characters:
if unicode_range(character) == primary_range:
languages.append(language)
break
return languages
@lru_cache()
def encoding_languages(iana_name: str) -> list[str]:
"""
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
This function does the correspondence.
"""
unicode_ranges: list[str] = encoding_unicode_range(iana_name)
primary_range: str | None = None
for specified_range in unicode_ranges:
if "Latin" not in specified_range:
primary_range = specified_range
break
if primary_range is None:
return ["Latin Based"]
return unicode_range_languages(primary_range)
@lru_cache()
def mb_encoding_languages(iana_name: str) -> list[str]:
"""
Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
This function does the correspondence.
"""
if (
iana_name.startswith("shift_")
or iana_name.startswith("iso2022_jp")
or iana_name.startswith("euc_j")
or iana_name == "cp932"
):
return ["Japanese"]
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
return ["Chinese"]
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
return ["Korean"]
return []
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> tuple[bool, bool]:
"""
Determine main aspects from a supported language if it contains accents and if is pure Latin.
"""
target_have_accents: bool = False
target_pure_latin: bool = True
for character in FREQUENCIES[language]:
if not target_have_accents and is_accentuated(character):
target_have_accents = True
if target_pure_latin and is_latin(character) is False:
target_pure_latin = False
return target_have_accents, target_pure_latin
def alphabet_languages(
characters: list[str], ignore_non_latin: bool = False
) -> list[str]:
"""
Return associated languages associated to given characters.
"""
languages: list[tuple[str, float]] = []
source_have_accents = any(is_accentuated(character) for character in characters)
for language, language_characters in FREQUENCIES.items():
target_have_accents, target_pure_latin = get_target_features(language)
if ignore_non_latin and target_pure_latin is False:
continue
if target_have_accents is False and source_have_accents:
continue
character_count: int = len(language_characters)
character_match_count: int = len(
[c for c in language_characters if c in characters]
)
ratio: float = character_match_count / character_count
if ratio >= 0.2:
languages.append((language, ratio))
languages = sorted(languages, key=lambda x: x[1], reverse=True)
return [compatible_language[0] for compatible_language in languages]
def characters_popularity_compare(
language: str, ordered_characters: list[str]
) -> float:
"""
Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
"""
if language not in FREQUENCIES:
raise ValueError(f"{language} not available")
character_approved_count: int = 0
FREQUENCIES_language_set = set(FREQUENCIES[language])
ordered_characters_count: int = len(ordered_characters)
target_language_characters_count: int = len(FREQUENCIES[language])
large_alphabet: bool = target_language_characters_count > 26
for character, character_rank in zip(
ordered_characters, range(0, ordered_characters_count)
):
if character not in FREQUENCIES_language_set:
continue
character_rank_in_language: int = FREQUENCIES[language].index(character)
expected_projection_ratio: float = (
target_language_characters_count / ordered_characters_count
)
character_rank_projection: int = int(character_rank * expected_projection_ratio)
if (
large_alphabet is False
and abs(character_rank_projection - character_rank_in_language) > 4
):
continue
if (
large_alphabet is True
and abs(character_rank_projection - character_rank_in_language)
< target_language_characters_count / 3
):
character_approved_count += 1
continue
characters_before_source: list[str] = FREQUENCIES[language][
0:character_rank_in_language
]
characters_after_source: list[str] = FREQUENCIES[language][
character_rank_in_language:
]
characters_before: list[str] = ordered_characters[0:character_rank]
characters_after: list[str] = ordered_characters[character_rank:]
before_match_count: int = len(
set(characters_before) & set(characters_before_source)
)
after_match_count: int = len(
set(characters_after) & set(characters_after_source)
)
if len(characters_before_source) == 0 and before_match_count <= 4:
character_approved_count += 1
continue
if len(characters_after_source) == 0 and after_match_count <= 4:
character_approved_count += 1
continue
if (
before_match_count / len(characters_before_source) >= 0.4
or after_match_count / len(characters_after_source) >= 0.4
):
character_approved_count += 1
continue
return character_approved_count / len(ordered_characters)
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
"""
Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
One containing the latin letters and the other hebrew.
"""
layers: dict[str, str] = {}
for character in decoded_sequence:
if character.isalpha() is False:
continue
character_range: str | None = unicode_range(character)
if character_range is None:
continue
layer_target_range: str | None = None
for discovered_range in layers:
if (
is_suspiciously_successive_range(discovered_range, character_range)
is False
):
layer_target_range = discovered_range
break
if layer_target_range is None:
layer_target_range = character_range
if layer_target_range not in layers:
layers[layer_target_range] = character.lower()
continue
layers[layer_target_range] += character.lower()
return list(layers.values())
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
"""
This function merge results previously given by the function coherence_ratio.
The return type is the same as coherence_ratio.
"""
per_language_ratios: dict[str, list[float]] = {}
for result in results:
for sub_result in result:
language, ratio = sub_result
if language not in per_language_ratios:
per_language_ratios[language] = [ratio]
continue
per_language_ratios[language].append(ratio)
merge = [
(
language,
round(
sum(per_language_ratios[language]) / len(per_language_ratios[language]),
4,
),
)
for language in per_language_ratios
]
return sorted(merge, key=lambda x: x[1], reverse=True)
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
"""
We shall NOT return "English—" in CoherenceMatches because it is an alternative
of "English". This function only keeps the best match and remove the em-dash in it.
"""
index_results: dict[str, list[float]] = dict()
for result in results:
language, ratio = result
no_em_name: str = language.replace("", "")
if no_em_name not in index_results:
index_results[no_em_name] = []
index_results[no_em_name].append(ratio)
if any(len(index_results[e]) > 1 for e in index_results):
filtered_results: CoherenceMatches = []
for language in index_results:
filtered_results.append((language, max(index_results[language])))
return filtered_results
return results
@lru_cache(maxsize=2048)
def coherence_ratio(
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
) -> CoherenceMatches:
"""
Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
A layer = Character extraction by alphabets/ranges.
"""
results: list[tuple[str, float]] = []
ignore_non_latin: bool = False
sufficient_match_count: int = 0
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
if "Latin Based" in lg_inclusion_list:
ignore_non_latin = True
lg_inclusion_list.remove("Latin Based")
for layer in alpha_unicode_split(decoded_sequence):
sequence_frequencies: TypeCounter[str] = Counter(layer)
most_common = sequence_frequencies.most_common()
character_count: int = sum(o for c, o in most_common)
if character_count <= TOO_SMALL_SEQUENCE:
continue
popular_character_ordered: list[str] = [c for c, o in most_common]
for language in lg_inclusion_list or alphabet_languages(
popular_character_ordered, ignore_non_latin
):
ratio: float = characters_popularity_compare(
language, popular_character_ordered
)
if ratio < threshold:
continue
elif ratio >= 0.8:
sufficient_match_count += 1
results.append((language, round(ratio, 4)))
if sufficient_match_count >= 3:
break
return sorted(
filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
)

View File

@ -0,0 +1,8 @@
from __future__ import annotations
from .__main__ import cli_detect, query_yes_no
__all__ = (
"cli_detect",
"query_yes_no",
)

View File

@ -0,0 +1,321 @@
from __future__ import annotations
import argparse
import sys
from json import dumps
from os.path import abspath, basename, dirname, join, realpath
from platform import python_version
from unicodedata import unidata_version
import charset_normalizer.md as md_module
from charset_normalizer import from_fp
from charset_normalizer.models import CliDetectionResult
from charset_normalizer.version import __version__
def query_yes_no(question: str, default: str = "yes") -> bool:
"""Ask a yes/no question via input() and return their answer.
"question" is a string that is presented to the user.
"default" is the presumed answer if the user just hits <Enter>.
It must be "yes" (the default), "no" or None (meaning
an answer is required of the user).
The "answer" return value is True for "yes" or False for "no".
Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
"""
valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
if default is None:
prompt = " [y/n] "
elif default == "yes":
prompt = " [Y/n] "
elif default == "no":
prompt = " [y/N] "
else:
raise ValueError("invalid default answer: '%s'" % default)
while True:
sys.stdout.write(question + prompt)
choice = input().lower()
if default is not None and choice == "":
return valid[default]
elif choice in valid:
return valid[choice]
else:
sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
def cli_detect(argv: list[str] | None = None) -> int:
"""
CLI assistant using ARGV and ArgumentParser
:param argv:
:return: 0 if everything is fine, anything else equal trouble
"""
parser = argparse.ArgumentParser(
description="The Real First Universal Charset Detector. "
"Discover originating encoding used on text file. "
"Normalize text to unicode."
)
parser.add_argument(
"files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
default=False,
dest="verbose",
help="Display complementary information about file if any. "
"Stdout will contain logs about the detection process.",
)
parser.add_argument(
"-a",
"--with-alternative",
action="store_true",
default=False,
dest="alternatives",
help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
)
parser.add_argument(
"-n",
"--normalize",
action="store_true",
default=False,
dest="normalize",
help="Permit to normalize input file. If not set, program does not write anything.",
)
parser.add_argument(
"-m",
"--minimal",
action="store_true",
default=False,
dest="minimal",
help="Only output the charset detected to STDOUT. Disabling JSON output.",
)
parser.add_argument(
"-r",
"--replace",
action="store_true",
default=False,
dest="replace",
help="Replace file when trying to normalize it instead of creating a new one.",
)
parser.add_argument(
"-f",
"--force",
action="store_true",
default=False,
dest="force",
help="Replace file without asking if you are sure, use this flag with caution.",
)
parser.add_argument(
"-i",
"--no-preemptive",
action="store_true",
default=False,
dest="no_preemptive",
help="Disable looking at a charset declaration to hint the detector.",
)
parser.add_argument(
"-t",
"--threshold",
action="store",
default=0.2,
type=float,
dest="threshold",
help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
)
parser.add_argument(
"--version",
action="version",
version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
__version__,
python_version(),
unidata_version,
"OFF" if md_module.__file__.lower().endswith(".py") else "ON",
),
help="Show version information and exit.",
)
args = parser.parse_args(argv)
if args.replace is True and args.normalize is False:
if args.files:
for my_file in args.files:
my_file.close()
print("Use --replace in addition of --normalize only.", file=sys.stderr)
return 1
if args.force is True and args.replace is False:
if args.files:
for my_file in args.files:
my_file.close()
print("Use --force in addition of --replace only.", file=sys.stderr)
return 1
if args.threshold < 0.0 or args.threshold > 1.0:
if args.files:
for my_file in args.files:
my_file.close()
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
return 1
x_ = []
for my_file in args.files:
matches = from_fp(
my_file,
threshold=args.threshold,
explain=args.verbose,
preemptive_behaviour=args.no_preemptive is False,
)
best_guess = matches.best()
if best_guess is None:
print(
'Unable to identify originating encoding for "{}". {}'.format(
my_file.name,
(
"Maybe try increasing maximum amount of chaos."
if args.threshold < 1.0
else ""
),
),
file=sys.stderr,
)
x_.append(
CliDetectionResult(
abspath(my_file.name),
None,
[],
[],
"Unknown",
[],
False,
1.0,
0.0,
None,
True,
)
)
else:
x_.append(
CliDetectionResult(
abspath(my_file.name),
best_guess.encoding,
best_guess.encoding_aliases,
[
cp
for cp in best_guess.could_be_from_charset
if cp != best_guess.encoding
],
best_guess.language,
best_guess.alphabets,
best_guess.bom,
best_guess.percent_chaos,
best_guess.percent_coherence,
None,
True,
)
)
if len(matches) > 1 and args.alternatives:
for el in matches:
if el != best_guess:
x_.append(
CliDetectionResult(
abspath(my_file.name),
el.encoding,
el.encoding_aliases,
[
cp
for cp in el.could_be_from_charset
if cp != el.encoding
],
el.language,
el.alphabets,
el.bom,
el.percent_chaos,
el.percent_coherence,
None,
False,
)
)
if args.normalize is True:
if best_guess.encoding.startswith("utf") is True:
print(
'"{}" file does not need to be normalized, as it already came from unicode.'.format(
my_file.name
),
file=sys.stderr,
)
if my_file.closed is False:
my_file.close()
continue
dir_path = dirname(realpath(my_file.name))
file_name = basename(realpath(my_file.name))
o_: list[str] = file_name.split(".")
if args.replace is False:
o_.insert(-1, best_guess.encoding)
if my_file.closed is False:
my_file.close()
elif (
args.force is False
and query_yes_no(
'Are you sure to normalize "{}" by replacing it ?'.format(
my_file.name
),
"no",
)
is False
):
if my_file.closed is False:
my_file.close()
continue
try:
x_[0].unicode_path = join(dir_path, ".".join(o_))
with open(x_[0].unicode_path, "wb") as fp:
fp.write(best_guess.output())
except OSError as e:
print(str(e), file=sys.stderr)
if my_file.closed is False:
my_file.close()
return 2
if my_file.closed is False:
my_file.close()
if args.minimal is False:
print(
dumps(
[el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
ensure_ascii=True,
indent=4,
)
)
else:
for my_file in args.files:
print(
", ".join(
[
el.encoding or "undefined"
for el in x_
if el.path == abspath(my_file.name)
]
)
)
return 0
if __name__ == "__main__":
cli_detect()

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,66 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Any
from warnings import warn
from .api import from_bytes
from .constant import CHARDET_CORRESPONDENCE
# TODO: remove this check when dropping Python 3.7 support
if TYPE_CHECKING:
from typing_extensions import TypedDict
class ResultDict(TypedDict):
encoding: str | None
language: str
confidence: float | None
def detect(
byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
) -> ResultDict:
"""
chardet legacy method
Detect the encoding of the given byte string. It should be mostly backward-compatible.
Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
This function is deprecated and should be used to migrate your project easily, consult the documentation for
further information. Not planned for removal.
:param byte_str: The byte sequence to examine.
:param should_rename_legacy: Should we rename legacy encodings
to their more modern equivalents?
"""
if len(kwargs):
warn(
f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
)
if not isinstance(byte_str, (bytearray, bytes)):
raise TypeError( # pragma: nocover
"Expected object of type bytes or bytearray, got: " "{}".format(
type(byte_str)
)
)
if isinstance(byte_str, bytearray):
byte_str = bytes(byte_str)
r = from_bytes(byte_str).best()
encoding = r.encoding if r is not None else None
language = r.language if r is not None and r.language != "Unknown" else ""
confidence = 1.0 - r.chaos if r is not None else None
# Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
# but chardet does return 'utf-8-sig' and it is a valid codec name.
if r is not None and encoding == "utf_8" and r.bom:
encoding += "_sig"
if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
encoding = CHARDET_CORRESPONDENCE[encoding]
return {
"encoding": encoding,
"language": language,
"confidence": confidence,
}

View File

@ -0,0 +1,630 @@
from __future__ import annotations
from functools import lru_cache
from logging import getLogger
from .constant import (
COMMON_SAFE_ASCII_CHARACTERS,
TRACE,
UNICODE_SECONDARY_RANGE_KEYWORD,
)
from .utils import (
is_accentuated,
is_arabic,
is_arabic_isolated_form,
is_case_variable,
is_cjk,
is_emoticon,
is_hangul,
is_hiragana,
is_katakana,
is_latin,
is_punctuation,
is_separator,
is_symbol,
is_thai,
is_unprintable,
remove_accent,
unicode_range,
)
class MessDetectorPlugin:
"""
Base abstract class used for mess detection plugins.
All detectors MUST extend and implement given methods.
"""
def eligible(self, character: str) -> bool:
"""
Determine if given character should be fed in.
"""
raise NotImplementedError # pragma: nocover
def feed(self, character: str) -> None:
"""
The main routine to be executed upon character.
Insert the logic in witch the text would be considered chaotic.
"""
raise NotImplementedError # pragma: nocover
def reset(self) -> None: # pragma: no cover
"""
Permit to reset the plugin to the initial state.
"""
raise NotImplementedError
@property
def ratio(self) -> float:
"""
Compute the chaos ratio based on what your feed() has seen.
Must NOT be lower than 0.; No restriction gt 0.
"""
raise NotImplementedError # pragma: nocover
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._punctuation_count: int = 0
self._symbol_count: int = 0
self._character_count: int = 0
self._last_printable_char: str | None = None
self._frenzy_symbol_in_word: bool = False
def eligible(self, character: str) -> bool:
return character.isprintable()
def feed(self, character: str) -> None:
self._character_count += 1
if (
character != self._last_printable_char
and character not in COMMON_SAFE_ASCII_CHARACTERS
):
if is_punctuation(character):
self._punctuation_count += 1
elif (
character.isdigit() is False
and is_symbol(character)
and is_emoticon(character) is False
):
self._symbol_count += 2
self._last_printable_char = character
def reset(self) -> None: # Abstract
self._punctuation_count = 0
self._character_count = 0
self._symbol_count = 0
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
ratio_of_punctuation: float = (
self._punctuation_count + self._symbol_count
) / self._character_count
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
class TooManyAccentuatedPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._character_count: int = 0
self._accentuated_count: int = 0
def eligible(self, character: str) -> bool:
return character.isalpha()
def feed(self, character: str) -> None:
self._character_count += 1
if is_accentuated(character):
self._accentuated_count += 1
def reset(self) -> None: # Abstract
self._character_count = 0
self._accentuated_count = 0
@property
def ratio(self) -> float:
if self._character_count < 8:
return 0.0
ratio_of_accentuation: float = self._accentuated_count / self._character_count
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
class UnprintablePlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._unprintable_count: int = 0
self._character_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if is_unprintable(character):
self._unprintable_count += 1
self._character_count += 1
def reset(self) -> None: # Abstract
self._unprintable_count = 0
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return (self._unprintable_count * 8) / self._character_count
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._successive_count: int = 0
self._character_count: int = 0
self._last_latin_character: str | None = None
def eligible(self, character: str) -> bool:
return character.isalpha() and is_latin(character)
def feed(self, character: str) -> None:
self._character_count += 1
if (
self._last_latin_character is not None
and is_accentuated(character)
and is_accentuated(self._last_latin_character)
):
if character.isupper() and self._last_latin_character.isupper():
self._successive_count += 1
# Worse if its the same char duplicated with different accent.
if remove_accent(character) == remove_accent(self._last_latin_character):
self._successive_count += 1
self._last_latin_character = character
def reset(self) -> None: # Abstract
self._successive_count = 0
self._character_count = 0
self._last_latin_character = None
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return (self._successive_count * 2) / self._character_count
class SuspiciousRange(MessDetectorPlugin):
def __init__(self) -> None:
self._suspicious_successive_range_count: int = 0
self._character_count: int = 0
self._last_printable_seen: str | None = None
def eligible(self, character: str) -> bool:
return character.isprintable()
def feed(self, character: str) -> None:
self._character_count += 1
if (
character.isspace()
or is_punctuation(character)
or character in COMMON_SAFE_ASCII_CHARACTERS
):
self._last_printable_seen = None
return
if self._last_printable_seen is None:
self._last_printable_seen = character
return
unicode_range_a: str | None = unicode_range(self._last_printable_seen)
unicode_range_b: str | None = unicode_range(character)
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
self._suspicious_successive_range_count += 1
self._last_printable_seen = character
def reset(self) -> None: # Abstract
self._character_count = 0
self._suspicious_successive_range_count = 0
self._last_printable_seen = None
@property
def ratio(self) -> float:
if self._character_count <= 13:
return 0.0
ratio_of_suspicious_range_usage: float = (
self._suspicious_successive_range_count * 2
) / self._character_count
return ratio_of_suspicious_range_usage
class SuperWeirdWordPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._word_count: int = 0
self._bad_word_count: int = 0
self._foreign_long_count: int = 0
self._is_current_word_bad: bool = False
self._foreign_long_watch: bool = False
self._character_count: int = 0
self._bad_character_count: int = 0
self._buffer: str = ""
self._buffer_accent_count: int = 0
self._buffer_glyph_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if character.isalpha():
self._buffer += character
if is_accentuated(character):
self._buffer_accent_count += 1
if (
self._foreign_long_watch is False
and (is_latin(character) is False or is_accentuated(character))
and is_cjk(character) is False
and is_hangul(character) is False
and is_katakana(character) is False
and is_hiragana(character) is False
and is_thai(character) is False
):
self._foreign_long_watch = True
if (
is_cjk(character)
or is_hangul(character)
or is_katakana(character)
or is_hiragana(character)
or is_thai(character)
):
self._buffer_glyph_count += 1
return
if not self._buffer:
return
if (
character.isspace() or is_punctuation(character) or is_separator(character)
) and self._buffer:
self._word_count += 1
buffer_length: int = len(self._buffer)
self._character_count += buffer_length
if buffer_length >= 4:
if self._buffer_accent_count / buffer_length >= 0.5:
self._is_current_word_bad = True
# Word/Buffer ending with an upper case accentuated letter are so rare,
# that we will consider them all as suspicious. Same weight as foreign_long suspicious.
elif (
is_accentuated(self._buffer[-1])
and self._buffer[-1].isupper()
and all(_.isupper() for _ in self._buffer) is False
):
self._foreign_long_count += 1
self._is_current_word_bad = True
elif self._buffer_glyph_count == 1:
self._is_current_word_bad = True
self._foreign_long_count += 1
if buffer_length >= 24 and self._foreign_long_watch:
camel_case_dst = [
i
for c, i in zip(self._buffer, range(0, buffer_length))
if c.isupper()
]
probable_camel_cased: bool = False
if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
probable_camel_cased = True
if not probable_camel_cased:
self._foreign_long_count += 1
self._is_current_word_bad = True
if self._is_current_word_bad:
self._bad_word_count += 1
self._bad_character_count += len(self._buffer)
self._is_current_word_bad = False
self._foreign_long_watch = False
self._buffer = ""
self._buffer_accent_count = 0
self._buffer_glyph_count = 0
elif (
character not in {"<", ">", "-", "=", "~", "|", "_"}
and character.isdigit() is False
and is_symbol(character)
):
self._is_current_word_bad = True
self._buffer += character
def reset(self) -> None: # Abstract
self._buffer = ""
self._is_current_word_bad = False
self._foreign_long_watch = False
self._bad_word_count = 0
self._word_count = 0
self._character_count = 0
self._bad_character_count = 0
self._foreign_long_count = 0
@property
def ratio(self) -> float:
if self._word_count <= 10 and self._foreign_long_count == 0:
return 0.0
return self._bad_character_count / self._character_count
class CjkInvalidStopPlugin(MessDetectorPlugin):
"""
GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
can be easily detected. Searching for the overuse of '' and ''.
"""
def __init__(self) -> None:
self._wrong_stop_count: int = 0
self._cjk_character_count: int = 0
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
if character in {"", ""}:
self._wrong_stop_count += 1
return
if is_cjk(character):
self._cjk_character_count += 1
def reset(self) -> None: # Abstract
self._wrong_stop_count = 0
self._cjk_character_count = 0
@property
def ratio(self) -> float:
if self._cjk_character_count < 16:
return 0.0
return self._wrong_stop_count / self._cjk_character_count
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._buf: bool = False
self._character_count_since_last_sep: int = 0
self._successive_upper_lower_count: int = 0
self._successive_upper_lower_count_final: int = 0
self._character_count: int = 0
self._last_alpha_seen: str | None = None
self._current_ascii_only: bool = True
def eligible(self, character: str) -> bool:
return True
def feed(self, character: str) -> None:
is_concerned = character.isalpha() and is_case_variable(character)
chunk_sep = is_concerned is False
if chunk_sep and self._character_count_since_last_sep > 0:
if (
self._character_count_since_last_sep <= 64
and character.isdigit() is False
and self._current_ascii_only is False
):
self._successive_upper_lower_count_final += (
self._successive_upper_lower_count
)
self._successive_upper_lower_count = 0
self._character_count_since_last_sep = 0
self._last_alpha_seen = None
self._buf = False
self._character_count += 1
self._current_ascii_only = True
return
if self._current_ascii_only is True and character.isascii() is False:
self._current_ascii_only = False
if self._last_alpha_seen is not None:
if (character.isupper() and self._last_alpha_seen.islower()) or (
character.islower() and self._last_alpha_seen.isupper()
):
if self._buf is True:
self._successive_upper_lower_count += 2
self._buf = False
else:
self._buf = True
else:
self._buf = False
self._character_count += 1
self._character_count_since_last_sep += 1
self._last_alpha_seen = character
def reset(self) -> None: # Abstract
self._character_count = 0
self._character_count_since_last_sep = 0
self._successive_upper_lower_count = 0
self._successive_upper_lower_count_final = 0
self._last_alpha_seen = None
self._buf = False
self._current_ascii_only = True
@property
def ratio(self) -> float:
if self._character_count == 0:
return 0.0
return self._successive_upper_lower_count_final / self._character_count
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
def __init__(self) -> None:
self._character_count: int = 0
self._isolated_form_count: int = 0
def reset(self) -> None: # Abstract
self._character_count = 0
self._isolated_form_count = 0
def eligible(self, character: str) -> bool:
return is_arabic(character)
def feed(self, character: str) -> None:
self._character_count += 1
if is_arabic_isolated_form(character):
self._isolated_form_count += 1
@property
def ratio(self) -> float:
if self._character_count < 8:
return 0.0
isolated_form_usage: float = self._isolated_form_count / self._character_count
return isolated_form_usage
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
unicode_range_a: str | None, unicode_range_b: str | None
) -> bool:
"""
Determine if two Unicode range seen next to each other can be considered as suspicious.
"""
if unicode_range_a is None or unicode_range_b is None:
return True
if unicode_range_a == unicode_range_b:
return False
if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
return False
if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
return False
# Latin characters can be accompanied with a combining diacritical mark
# eg. Vietnamese.
if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
"Combining" in unicode_range_a or "Combining" in unicode_range_b
):
return False
keywords_range_a, keywords_range_b = (
unicode_range_a.split(" "),
unicode_range_b.split(" "),
)
for el in keywords_range_a:
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
continue
if el in keywords_range_b:
return False
# Japanese Exception
range_a_jp_chars, range_b_jp_chars = (
unicode_range_a
in (
"Hiragana",
"Katakana",
),
unicode_range_b in ("Hiragana", "Katakana"),
)
if (range_a_jp_chars or range_b_jp_chars) and (
"CJK" in unicode_range_a or "CJK" in unicode_range_b
):
return False
if range_a_jp_chars and range_b_jp_chars:
return False
if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
return False
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
return False
# Chinese/Japanese use dedicated range for punctuation and/or separators.
if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
unicode_range_a in ["Katakana", "Hiragana"]
and unicode_range_b in ["Katakana", "Hiragana"]
):
if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
return False
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
return False
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
return False
return True
@lru_cache(maxsize=2048)
def mess_ratio(
decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
"""
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
"""
detectors: list[MessDetectorPlugin] = [
md_class() for md_class in MessDetectorPlugin.__subclasses__()
]
length: int = len(decoded_sequence) + 1
mean_mess_ratio: float = 0.0
if length < 512:
intermediary_mean_mess_ratio_calc: int = 32
elif length <= 1024:
intermediary_mean_mess_ratio_calc = 64
else:
intermediary_mean_mess_ratio_calc = 128
for character, index in zip(decoded_sequence + "\n", range(length)):
for detector in detectors:
if detector.eligible(character):
detector.feed(character)
if (
index > 0 and index % intermediary_mean_mess_ratio_calc == 0
) or index == length - 1:
mean_mess_ratio = sum(dt.ratio for dt in detectors)
if mean_mess_ratio >= maximum_threshold:
break
if debug:
logger = getLogger("charset_normalizer")
logger.log(
TRACE,
"Mess-detector extended-analysis start. "
f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
f"maximum_threshold={maximum_threshold}",
)
if len(decoded_sequence) > 16:
logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
for dt in detectors:
logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
return round(mean_mess_ratio, 3)

View File

@ -0,0 +1,360 @@
from __future__ import annotations
from encodings.aliases import aliases
from hashlib import sha256
from json import dumps
from re import sub
from typing import Any, Iterator, List, Tuple
from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
from .utils import iana_name, is_multi_byte_encoding, unicode_range
class CharsetMatch:
def __init__(
self,
payload: bytes,
guessed_encoding: str,
mean_mess_ratio: float,
has_sig_or_bom: bool,
languages: CoherenceMatches,
decoded_payload: str | None = None,
preemptive_declaration: str | None = None,
):
self._payload: bytes = payload
self._encoding: str = guessed_encoding
self._mean_mess_ratio: float = mean_mess_ratio
self._languages: CoherenceMatches = languages
self._has_sig_or_bom: bool = has_sig_or_bom
self._unicode_ranges: list[str] | None = None
self._leaves: list[CharsetMatch] = []
self._mean_coherence_ratio: float = 0.0
self._output_payload: bytes | None = None
self._output_encoding: str | None = None
self._string: str | None = decoded_payload
self._preemptive_declaration: str | None = preemptive_declaration
def __eq__(self, other: object) -> bool:
if not isinstance(other, CharsetMatch):
if isinstance(other, str):
return iana_name(other) == self.encoding
return False
return self.encoding == other.encoding and self.fingerprint == other.fingerprint
def __lt__(self, other: object) -> bool:
"""
Implemented to make sorted available upon CharsetMatches items.
"""
if not isinstance(other, CharsetMatch):
raise ValueError
chaos_difference: float = abs(self.chaos - other.chaos)
coherence_difference: float = abs(self.coherence - other.coherence)
# Below 1% difference --> Use Coherence
if chaos_difference < 0.01 and coherence_difference > 0.02:
return self.coherence > other.coherence
elif chaos_difference < 0.01 and coherence_difference <= 0.02:
# When having a difficult decision, use the result that decoded as many multi-byte as possible.
# preserve RAM usage!
if len(self._payload) >= TOO_BIG_SEQUENCE:
return self.chaos < other.chaos
return self.multi_byte_usage > other.multi_byte_usage
return self.chaos < other.chaos
@property
def multi_byte_usage(self) -> float:
return 1.0 - (len(str(self)) / len(self.raw))
def __str__(self) -> str:
# Lazy Str Loading
if self._string is None:
self._string = str(self._payload, self._encoding, "strict")
return self._string
def __repr__(self) -> str:
return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"
def add_submatch(self, other: CharsetMatch) -> None:
if not isinstance(other, CharsetMatch) or other == self:
raise ValueError(
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
other.__class__
)
)
other._string = None # Unload RAM usage; dirty trick.
self._leaves.append(other)
@property
def encoding(self) -> str:
return self._encoding
@property
def encoding_aliases(self) -> list[str]:
"""
Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
"""
also_known_as: list[str] = []
for u, p in aliases.items():
if self.encoding == u:
also_known_as.append(p)
elif self.encoding == p:
also_known_as.append(u)
return also_known_as
@property
def bom(self) -> bool:
return self._has_sig_or_bom
@property
def byte_order_mark(self) -> bool:
return self._has_sig_or_bom
@property
def languages(self) -> list[str]:
"""
Return the complete list of possible languages found in decoded sequence.
Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
"""
return [e[0] for e in self._languages]
@property
def language(self) -> str:
"""
Most probable language found in decoded sequence. If none were detected or inferred, the property will return
"Unknown".
"""
if not self._languages:
# Trying to infer the language based on the given encoding
# Its either English or we should not pronounce ourselves in certain cases.
if "ascii" in self.could_be_from_charset:
return "English"
# doing it there to avoid circular import
from charset_normalizer.cd import encoding_languages, mb_encoding_languages
languages = (
mb_encoding_languages(self.encoding)
if is_multi_byte_encoding(self.encoding)
else encoding_languages(self.encoding)
)
if len(languages) == 0 or "Latin Based" in languages:
return "Unknown"
return languages[0]
return self._languages[0][0]
@property
def chaos(self) -> float:
return self._mean_mess_ratio
@property
def coherence(self) -> float:
if not self._languages:
return 0.0
return self._languages[0][1]
@property
def percent_chaos(self) -> float:
return round(self.chaos * 100, ndigits=3)
@property
def percent_coherence(self) -> float:
return round(self.coherence * 100, ndigits=3)
@property
def raw(self) -> bytes:
"""
Original untouched bytes.
"""
return self._payload
@property
def submatch(self) -> list[CharsetMatch]:
return self._leaves
@property
def has_submatch(self) -> bool:
return len(self._leaves) > 0
@property
def alphabets(self) -> list[str]:
if self._unicode_ranges is not None:
return self._unicode_ranges
# list detected ranges
detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
# filter and sort
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
return self._unicode_ranges
@property
def could_be_from_charset(self) -> list[str]:
"""
The complete list of encoding that output the exact SAME str result and therefore could be the originating
encoding.
This list does include the encoding available in property 'encoding'.
"""
return [self._encoding] + [m.encoding for m in self._leaves]
def output(self, encoding: str = "utf_8") -> bytes:
"""
Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
Any errors will be simply ignored by the encoder NOT replaced.
"""
if self._output_encoding is None or self._output_encoding != encoding:
self._output_encoding = encoding
decoded_string = str(self)
if (
self._preemptive_declaration is not None
and self._preemptive_declaration.lower()
not in ["utf-8", "utf8", "utf_8"]
):
patched_header = sub(
RE_POSSIBLE_ENCODING_INDICATION,
lambda m: m.string[m.span()[0] : m.span()[1]].replace(
m.groups()[0],
iana_name(self._output_encoding).replace("_", "-"), # type: ignore[arg-type]
),
decoded_string[:8192],
count=1,
)
decoded_string = patched_header + decoded_string[8192:]
self._output_payload = decoded_string.encode(encoding, "replace")
return self._output_payload # type: ignore
@property
def fingerprint(self) -> str:
"""
Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
"""
return sha256(self.output()).hexdigest()
class CharsetMatches:
"""
Container with every CharsetMatch items ordered by default from most probable to the less one.
Act like a list(iterable) but does not implements all related methods.
"""
def __init__(self, results: list[CharsetMatch] | None = None):
self._results: list[CharsetMatch] = sorted(results) if results else []
def __iter__(self) -> Iterator[CharsetMatch]:
yield from self._results
def __getitem__(self, item: int | str) -> CharsetMatch:
"""
Retrieve a single item either by its position or encoding name (alias may be used here).
Raise KeyError upon invalid index or encoding not present in results.
"""
if isinstance(item, int):
return self._results[item]
if isinstance(item, str):
item = iana_name(item, False)
for result in self._results:
if item in result.could_be_from_charset:
return result
raise KeyError
def __len__(self) -> int:
return len(self._results)
def __bool__(self) -> bool:
return len(self._results) > 0
def append(self, item: CharsetMatch) -> None:
"""
Insert a single match. Will be inserted accordingly to preserve sort.
Can be inserted as a submatch.
"""
if not isinstance(item, CharsetMatch):
raise ValueError(
"Cannot append instance '{}' to CharsetMatches".format(
str(item.__class__)
)
)
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
if len(item.raw) < TOO_BIG_SEQUENCE:
for match in self._results:
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
match.add_submatch(item)
return
self._results.append(item)
self._results = sorted(self._results)
def best(self) -> CharsetMatch | None:
"""
Simply return the first match. Strict equivalent to matches[0].
"""
if not self._results:
return None
return self._results[0]
def first(self) -> CharsetMatch | None:
"""
Redundant method, call the method best(). Kept for BC reasons.
"""
return self.best()
CoherenceMatch = Tuple[str, float]
CoherenceMatches = List[CoherenceMatch]
class CliDetectionResult:
def __init__(
self,
path: str,
encoding: str | None,
encoding_aliases: list[str],
alternative_encodings: list[str],
language: str,
alphabets: list[str],
has_sig_or_bom: bool,
chaos: float,
coherence: float,
unicode_path: str | None,
is_preferred: bool,
):
self.path: str = path
self.unicode_path: str | None = unicode_path
self.encoding: str | None = encoding
self.encoding_aliases: list[str] = encoding_aliases
self.alternative_encodings: list[str] = alternative_encodings
self.language: str = language
self.alphabets: list[str] = alphabets
self.has_sig_or_bom: bool = has_sig_or_bom
self.chaos: float = chaos
self.coherence: float = coherence
self.is_preferred: bool = is_preferred
@property
def __dict__(self) -> dict[str, Any]: # type: ignore
return {
"path": self.path,
"encoding": self.encoding,
"encoding_aliases": self.encoding_aliases,
"alternative_encodings": self.alternative_encodings,
"language": self.language,
"alphabets": self.alphabets,
"has_sig_or_bom": self.has_sig_or_bom,
"chaos": self.chaos,
"coherence": self.coherence,
"unicode_path": self.unicode_path,
"is_preferred": self.is_preferred,
}
def to_json(self) -> str:
return dumps(self.__dict__, ensure_ascii=True, indent=4)

View File

@ -0,0 +1,408 @@
from __future__ import annotations
import importlib
import logging
import unicodedata
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator
from _multibytecodec import ( # type: ignore[import-not-found,import]
MultibyteIncrementalDecoder,
)
from .constant import (
ENCODING_MARKS,
IANA_SUPPORTED_SIMILAR,
RE_POSSIBLE_ENCODING_INDICATION,
UNICODE_RANGES_COMBINED,
UNICODE_SECONDARY_RANGE_KEYWORD,
UTF8_MAXIMAL_ALLOCATION,
)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
try:
description: str = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return (
"WITH GRAVE" in description
or "WITH ACUTE" in description
or "WITH CEDILLA" in description
or "WITH DIAERESIS" in description
or "WITH CIRCUMFLEX" in description
or "WITH TILDE" in description
or "WITH MACRON" in description
or "WITH RING ABOVE" in description
)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
decomposed: str = unicodedata.decomposition(character)
if not decomposed:
return character
codes: list[str] = decomposed.split(" ")
return chr(int(codes[0], 16))
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> str | None:
"""
Retrieve the Unicode range official name from a single character.
"""
character_ord: int = ord(character)
for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
if character_ord in ord_range:
return range_name
return None
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
try:
description: str = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "LATIN" in description
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
character_category: str = unicodedata.category(character)
if "P" in character_category:
return True
character_range: str | None = unicode_range(character)
if character_range is None:
return False
return "Punctuation" in character_range
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
character_category: str = unicodedata.category(character)
if "S" in character_category or "N" in character_category:
return True
character_range: str | None = unicode_range(character)
if character_range is None:
return False
return "Forms" in character_range and character_category != "Lo"
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
character_range: str | None = unicode_range(character)
if character_range is None:
return False
return "Emoticons" in character_range or "Pictographs" in character_range
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
if character.isspace() or character in {"", "+", "<", ">"}:
return True
character_category: str = unicodedata.category(character)
return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
return character.islower() != character.isupper()
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "CJK" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "HIRAGANA" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "KATAKANA" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "HANGUL" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "THAI" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "ARABIC" in character_name
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_arabic_isolated_form(character: str) -> bool:
try:
character_name = unicodedata.name(character)
except ValueError: # Defensive: unicode database outdated?
return False
return "ARABIC" in character_name and "ISOLATED FORM" in character_name
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
return (
character.isspace() is False # includes \n \t \r \v
and character.isprintable() is False
and character != "\x1a" # Why? Its the ASCII substitute character.
and character != "\ufeff" # bug discovered in Python,
# Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
)
def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
"""
Extract using ASCII-only decoder any specified encoding in the first n-bytes.
"""
if not isinstance(sequence, bytes):
raise TypeError
seq_len: int = len(sequence)
results: list[str] = findall(
RE_POSSIBLE_ENCODING_INDICATION,
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
)
if len(results) == 0:
return None
for specified_encoding in results:
specified_encoding = specified_encoding.lower().replace("-", "_")
encoding_alias: str
encoding_iana: str
for encoding_alias, encoding_iana in aliases.items():
if encoding_alias == specified_encoding:
return encoding_iana
if encoding_iana == specified_encoding:
return encoding_iana
return None
@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
"""
Verify is a specific encoding is a multi byte one based on it IANA name
"""
return name in {
"utf_8",
"utf_8_sig",
"utf_16",
"utf_16_be",
"utf_16_le",
"utf_32",
"utf_32_le",
"utf_32_be",
"utf_7",
} or issubclass(
importlib.import_module(f"encodings.{name}").IncrementalDecoder,
MultibyteIncrementalDecoder,
)
def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
"""
Identify and extract SIG/BOM in given sequence.
"""
for iana_encoding in ENCODING_MARKS:
marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]
if isinstance(marks, bytes):
marks = [marks]
for mark in marks:
if sequence.startswith(mark):
return iana_encoding, mark
return None, b""
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
return iana_encoding not in {"utf_16", "utf_32"}
def iana_name(cp_name: str, strict: bool = True) -> str:
"""Returns the Python normalized encoding name (Not the IANA official name)."""
cp_name = cp_name.lower().replace("-", "_")
encoding_alias: str
encoding_iana: str
for encoding_alias, encoding_iana in aliases.items():
if cp_name in [encoding_alias, encoding_iana]:
return encoding_iana
if strict:
raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")
return cp_name
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
return 0.0
decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder
id_a: IncrementalDecoder = decoder_a(errors="ignore")
id_b: IncrementalDecoder = decoder_b(errors="ignore")
character_match_count: int = 0
for i in range(255):
to_be_decoded: bytes = bytes([i])
if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
character_match_count += 1
return character_match_count / 254
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
"""
Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
the function cp_similarity.
"""
return (
iana_name_a in IANA_SUPPORTED_SIMILAR
and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
)
def set_logging_handler(
name: str = "charset_normalizer",
level: int = logging.INFO,
format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
logger = logging.getLogger(name)
logger.setLevel(level)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(format_string))
logger.addHandler(handler)
def cut_sequence_chunks(
sequences: bytes,
encoding_iana: str,
offsets: range,
chunk_size: int,
bom_or_sig_available: bool,
strip_sig_or_bom: bool,
sig_payload: bytes,
is_multi_byte_decoder: bool,
decoded_payload: str | None = None,
) -> Generator[str, None, None]:
if decoded_payload and is_multi_byte_decoder is False:
for i in offsets:
chunk = decoded_payload[i : i + chunk_size]
if not chunk:
break
yield chunk
else:
for i in offsets:
chunk_end = i + chunk_size
if chunk_end > len(sequences) + 8:
continue
cut_sequence = sequences[i : i + chunk_size]
if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence
chunk = cut_sequence.decode(
encoding_iana,
errors="ignore" if is_multi_byte_decoder else "strict",
)
# multi-byte bad cutting detector and adjustment
# not the cleanest way to perform that fix but clever enough for now.
if is_multi_byte_decoder and i > 0:
chunk_partial_size_chk: int = min(chunk_size, 16)
if (
decoded_payload
and chunk[:chunk_partial_size_chk] not in decoded_payload
):
for j in range(i, i - 4, -1):
cut_sequence = sequences[j:chunk_end]
if bom_or_sig_available and strip_sig_or_bom is False:
cut_sequence = sig_payload + cut_sequence
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
if chunk[:chunk_partial_size_chk] in decoded_payload:
break
yield chunk

View File

@ -0,0 +1,8 @@
"""
Expose version
"""
from __future__ import annotations
__version__ = "3.4.1"
VERSION = __version__.split(".")

View File

@ -0,0 +1 @@
pip

View File

@ -0,0 +1,28 @@
Copyright 2014 Pallets
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,74 @@
Metadata-Version: 2.3
Name: click
Version: 8.1.8
Summary: Composable command line interface toolkit
Maintainer-email: Pallets <contact@palletsprojects.com>
Requires-Python: >=3.7
Description-Content-Type: text/markdown
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: BSD License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Typing :: Typed
Requires-Dist: colorama; platform_system == 'Windows'
Requires-Dist: importlib-metadata; python_version < '3.8'
Project-URL: Changes, https://click.palletsprojects.com/changes/
Project-URL: Chat, https://discord.gg/pallets
Project-URL: Documentation, https://click.palletsprojects.com/
Project-URL: Donate, https://palletsprojects.com/donate
Project-URL: Source, https://github.com/pallets/click/
# $ click_
Click is a Python package for creating beautiful command line interfaces
in a composable way with as little code as necessary. It's the "Command
Line Interface Creation Kit". It's highly configurable but comes with
sensible defaults out of the box.
It aims to make the process of writing command line tools quick and fun
while also preventing any frustration caused by the inability to
implement an intended CLI API.
Click in three points:
- Arbitrary nesting of commands
- Automatic help page generation
- Supports lazy loading of subcommands at runtime
## A Simple Example
```python
import click
@click.command()
@click.option("--count", default=1, help="Number of greetings.")
@click.option("--name", prompt="Your name", help="The person to greet.")
def hello(count, name):
"""Simple program that greets NAME for a total of COUNT times."""
for _ in range(count):
click.echo(f"Hello, {name}!")
if __name__ == '__main__':
hello()
```
```
$ python hello.py --count=3
Your name: Click
Hello, Click!
Hello, Click!
Hello, Click!
```
## Donate
The Pallets organization develops and supports Click and other popular
packages. In order to grow the community of contributors and users, and
allow the maintainers to devote more time to the projects, [please
donate today][].
[please donate today]: https://palletsprojects.com/donate

View File

@ -0,0 +1,38 @@
click-8.1.8.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
click-8.1.8.dist-info/LICENSE.txt,sha256=morRBqOU6FO_4h9C9OctWSgZoigF2ZG18ydQKSkrZY0,1475
click-8.1.8.dist-info/METADATA,sha256=WJtQ6uGS2ybLfvUE4vC0XIhIBr4yFGwjrMBR2fiCQ-Q,2263
click-8.1.8.dist-info/RECORD,,
click-8.1.8.dist-info/WHEEL,sha256=CpUCUxeHQbRN5UGRQHYRJorO5Af-Qy_fHMctcQ8DSGI,82
click/__init__.py,sha256=j1DJeCbga4ribkv5uyvIAzI0oFN13fW9mevDKShFelo,3188
click/__pycache__/__init__.cpython-311.pyc,,
click/__pycache__/_compat.cpython-311.pyc,,
click/__pycache__/_termui_impl.cpython-311.pyc,,
click/__pycache__/_textwrap.cpython-311.pyc,,
click/__pycache__/_winconsole.cpython-311.pyc,,
click/__pycache__/core.cpython-311.pyc,,
click/__pycache__/decorators.cpython-311.pyc,,
click/__pycache__/exceptions.cpython-311.pyc,,
click/__pycache__/formatting.cpython-311.pyc,,
click/__pycache__/globals.cpython-311.pyc,,
click/__pycache__/parser.cpython-311.pyc,,
click/__pycache__/shell_completion.cpython-311.pyc,,
click/__pycache__/termui.cpython-311.pyc,,
click/__pycache__/testing.cpython-311.pyc,,
click/__pycache__/types.cpython-311.pyc,,
click/__pycache__/utils.cpython-311.pyc,,
click/_compat.py,sha256=IGKh_J5QdfKELitnRfTGHneejWxoCw_NX9tfMbdcg3w,18730
click/_termui_impl.py,sha256=a5z7I9gOFeMmu7Gb6_RPyQ8GPuVP1EeblixcWSPSQPk,24783
click/_textwrap.py,sha256=10fQ64OcBUMuK7mFvh8363_uoOxPlRItZBmKzRJDgoY,1353
click/_winconsole.py,sha256=5ju3jQkcZD0W27WEMGqmEP4y_crUVzPCqsX_FYb7BO0,7860
click/core.py,sha256=Q1nEVdctZwvIPOlt4vfHko0TYnHCeE40UEEul8Wpyvs,114748
click/decorators.py,sha256=7t6F-QWowtLh6F_6l-4YV4Y4yNTcqFQEu9i37zIz68s,18925
click/exceptions.py,sha256=V7zDT6emqJ8iNl0kF1P5kpFmLMWQ1T1L7aNNKM4YR0w,9600
click/formatting.py,sha256=Frf0-5W33-loyY_i9qrwXR8-STnW3m5gvyxLVUdyxyk,9706
click/globals.py,sha256=cuJ6Bbo073lgEEmhjr394PeM-QFmXM-Ci-wmfsd7H5g,1954
click/parser.py,sha256=h4sndcpF5OHrZQN8vD8IWb5OByvW7ABbhRToxovrqS8,19067
click/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
click/shell_completion.py,sha256=TR0dXEGcvWb9Eo3aaQEXGhnvNS3FF4H4QcuLnvAvYo4,18636
click/termui.py,sha256=dLxiS70UOvIYBda_nEEZaPAFOVDVmRs1sEPMuLDowQo,28310
click/testing.py,sha256=3RA8anCf7TZ8-5RAF5it2Te-aWXBAL5VLasQnMiC2ZQ,16282
click/types.py,sha256=BD5Qqq4h-8kawBmOIzJlmq4xzThAf4wCvaOLZSBDNx0,36422
click/utils.py,sha256=ce-IrO9ilII76LGkU354pOdHbepM8UftfNH7SfMU_28,20330

Some files were not shown because too many files have changed in this diff Show More