From 258d837dadbc78e0fc3f2d2ff4b52545e7b55f64 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Wed, 21 May 2025 07:14:01 +0000
Subject: [PATCH] feat: Add initial pronunciation practice tool

This commit introduces a new web application designed to help you improve your English pronunciation.
Key features include:
- **File Upload:** You can upload a TXT file containing English text and an MP3 audio file of your reading.
- **Speech-to-Text (STT):** The backend uses Google Cloud Speech-to-Text (GCP STT) to transcribe your audio. (Requires GCP setup).
- **Text Comparison:** The original text is compared with the transcribed text using difflib to identify discrepancies.
- **Results Display:** The frontend shows the original text, the transcribed text, and a highlighted diff of the comparison.
- **Issue Storage:** Identified pronunciation differences (problematic words/phrases) are stored in your session.
- **Exercise Page:** A dedicated page (`/exercise`) displays these stored issues, providing targeted practice material.
- **Backend & Frontend:** Implemented using Flask for the backend and vanilla JavaScript for the frontend.
- **Unit Tests:** Basic unit tests for the Flask backend logic have been added, covering file handling, STT mocking, session management, and page rendering.
The tool is structured within the `pronunciation_practice_tool` directory, containing templates, static assets, upload handling, and tests.
---
pronunciation_practice_tool/__init__.py | 1 +
pronunciation_practice_tool/app.py | 147 ++++++++++++++++++
pronunciation_practice_tool/requirements.txt | 3 +
pronunciation_practice_tool/static/app.js | 112 +++++++++++++
pronunciation_practice_tool/static/style.css | 12 ++
.../templates/exercise.html | 39 +++++
.../templates/index.html | 27 ++++
pronunciation_practice_tool/tests/__init__.py | 1 +
pronunciation_practice_tool/tests/test_app.py | 123 +++++++++++++++
pronunciation_practice_tool/uploads/.gitkeep | 1 +
10 files changed, 466 insertions(+)
create mode 100644 pronunciation_practice_tool/__init__.py
create mode 100644 pronunciation_practice_tool/app.py
create mode 100644 pronunciation_practice_tool/requirements.txt
create mode 100644 pronunciation_practice_tool/static/app.js
create mode 100644 pronunciation_practice_tool/static/style.css
create mode 100644 pronunciation_practice_tool/templates/exercise.html
create mode 100644 pronunciation_practice_tool/templates/index.html
create mode 100644 pronunciation_practice_tool/tests/__init__.py
create mode 100644 pronunciation_practice_tool/tests/test_app.py
create mode 100644 pronunciation_practice_tool/uploads/.gitkeep
diff --git a/pronunciation_practice_tool/__init__.py b/pronunciation_practice_tool/__init__.py
new file mode 100644
index 0000000..e17c3f6
--- /dev/null
+++ b/pronunciation_practice_tool/__init__.py
@@ -0,0 +1 @@
+# This file makes pronunciation_practice_tool a Python package.
diff --git a/pronunciation_practice_tool/app.py b/pronunciation_practice_tool/app.py
new file mode 100644
index 0000000..6cd1d8d
--- /dev/null
+++ b/pronunciation_practice_tool/app.py
@@ -0,0 +1,147 @@
+import os
+from flask import Flask, request, jsonify, render_template, session # Added session
+from werkzeug.utils import secure_filename
+from google.cloud import speech
+import difflib
+
+app = Flask(__name__, template_folder='templates', static_folder='static')
+# A key from FLASK_SECRET_KEY keeps sessions valid across restarts/workers; else random per process.
+app.secret_key = os.environ.get('FLASK_SECRET_KEY') or os.urandom(24)
+
+UPLOAD_FOLDER = 'uploads'
+ALLOWED_EXTENSIONS_TEXT = {'txt'}
+ALLOWED_EXTENSIONS_AUDIO = {'mp3'}
+app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
+os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+
+def transcribe_audio_gcp(audio_file_path):
+    """Transcribe an audio file with Google Cloud Speech-to-Text (en-US).
+
+    Returns the concatenated top-alternative transcript of every result. Client
+    or API failures do not raise: a string starting with "GCP STT Error:" is
+    returned instead, which upload_files() checks for by prefix.
+    """
+    with open(audio_file_path, 'rb') as audio_file:
+        content = audio_file.read()
+    try:
+        # Build the client inside the try: it resolves credentials and may raise.
+        client = speech.SpeechClient()
+        audio = speech.RecognitionAudio(content=content)
+        config = speech.RecognitionConfig(language_code='en-US', enable_automatic_punctuation=True)
+        response = client.recognize(config=config, audio=audio)
+        # Skip results that carry no alternatives instead of failing on [0].
+        return "".join(r.alternatives[0].transcript for r in response.results if r.alternatives)
+    except Exception as e:
+        print(f"Google Cloud Speech-to-Text Error: {e}")
+        if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"):
+            return "GCP STT Error: GOOGLE_APPLICATION_CREDENTIALS not set."
+        return f"GCP STT Error: {e}"
+
+
+def allowed_file(filename, allowed_extensions):
+    """Return True when filename contains a dot and its last extension, lowercased, is allowed."""
+    return '.' in filename and filename.rpartition('.')[2].lower() in allowed_extensions
+
+@app.route('/')
+def index():
+ return render_template('index.html')  # serve templates/index.html at the site root
+
+@app.route('/upload', methods=['POST'])
+def upload_files():
+    """Compare an uploaded .txt script with the transcript of an uploaded .mp3.
+
+    Expects the multipart fields 'text-file' and 'audio-file'. Both uploads are
+    validated before anything is written, then saved under
+    app.config['UPLOAD_FOLDER']; the audio is transcribed with
+    transcribe_audio_gcp() and diffed line by line against the text.
+
+    Side effect: session['pronunciation_issues'] is reset to [] on every call
+    and only filled (with the stripped '- ' / '+ ' diff lines) when both the
+    text and a non-error transcript are available.
+
+    Returns:
+        (JSON response, status): 400 with an 'error' key for missing or invalid
+        uploads; otherwise 200 with the saved file names, the text, the
+        transcript and the full diff.
+    """
+    # Every outcome, including the error responses below, starts from no issues.
+    session['pronunciation_issues'] = []
+
+    if 'text-file' not in request.files or 'audio-file' not in request.files:
+        return jsonify({'error': 'Missing file part(s)'}), 400
+
+    text_file = request.files['text-file']
+    audio_file = request.files['audio-file']
+
+    if text_file.filename == '' or audio_file.filename == '':
+        return jsonify({'error': 'No selected file(s)'}), 400
+
+    # Check both extensions before saving anything, so a rejected audio file never
+    # leaves a just-written text file behind (and no cleanup pass is needed).
+    if not (text_file and allowed_file(text_file.filename, ALLOWED_EXTENSIONS_TEXT)):
+        return jsonify({'error': 'Invalid text file type. Please upload a .txt file.'}), 400
+    if not (audio_file and allowed_file(audio_file.filename, ALLOWED_EXTENSIONS_AUDIO)):
+        return jsonify({'error': 'Invalid audio file type. Please upload an .mp3 file.'}), 400
+
+    # NOTE(review): files are stored under the client-supplied (sanitized) name and
+    # are never removed, so same-named uploads overwrite each other.
+    text_filename = secure_filename(text_file.filename)
+    text_filepath = os.path.join(app.config['UPLOAD_FOLDER'], text_filename)
+    text_file.save(text_filepath)
+
+    uploaded_text_content = ""
+    try:
+        with open(text_filepath, 'r', encoding='utf-8') as f:
+            uploaded_text_content = f.read()
+    except (OSError, UnicodeDecodeError) as e:
+        # Best effort: an unreadable text file only means no comparison is made.
+        print(f"Error reading text file: {e}")
+
+    audio_filename = secure_filename(audio_file.filename)
+    audio_filepath = os.path.join(app.config['UPLOAD_FOLDER'], audio_filename)
+    audio_file.save(audio_filepath)
+
+    transcript = transcribe_audio_gcp(audio_filepath)
+
+    diff_output_list = []
+    pronunciation_issues = []
+    # transcribe_audio_gcp() reports failure as a string with this prefix.
+    transcription_ok = bool(transcript) and not transcript.startswith("GCP STT Error:")
+    if uploaded_text_content and transcription_ok:
+        # The comparison is per line: one differing word flags the whole line.
+        diff_output_list = list(difflib.Differ().compare(
+            uploaded_text_content.splitlines(),
+            transcript.splitlines(),
+        ))
+        # Only removed ('- ') and added ('+ ') lines are stored as issues; the
+        # unchanged and '? ' hint lines stay in the diff returned to the client.
+        for line in diff_output_list:
+            issue_text = line[2:].strip()
+            if line.startswith(('- ', '+ ')) and issue_text:
+                pronunciation_issues.append(issue_text)
+
+    session['pronunciation_issues'] = pronunciation_issues
+
+    # A failed transcription still returns 200; its error text is in 'transcribed_text'.
+    return jsonify({
+        'message': 'Files processed successfully',
+        'text_file': text_filename,
+        'audio_file': audio_filename,
+        'text_content': uploaded_text_content,
+        'transcribed_text': transcript,
+        'differences': diff_output_list
+    }), 200
+
+@app.route('/exercise')
+def exercise_page():
+    """Render the practice page for the issues stored in the session.
+
+    Reads session['pronunciation_issues'] (an empty list when the key is absent)
+    and passes it to exercise.html as `issues`; an empty list is forwarded
+    as-is rather than triggering a redirect.
+    """
+    stored_issues = session.get('pronunciation_issues', [])
+    return render_template('exercise.html', issues=stored_issues)
+
+if __name__ == '__main__':
+ app.run(debug=True)  # NOTE(review): debug=True enables the Werkzeug debugger; local development only
diff --git a/pronunciation_practice_tool/requirements.txt b/pronunciation_practice_tool/requirements.txt
new file mode 100644
index 0000000..78058f1
--- /dev/null
+++ b/pronunciation_practice_tool/requirements.txt
@@ -0,0 +1,3 @@
+Flask>=2.0
+google-cloud-speech>=2.0
+werkzeug>=2.0
diff --git a/pronunciation_practice_tool/static/app.js b/pronunciation_practice_tool/static/app.js
new file mode 100644
index 0000000..7b96b87
--- /dev/null
+++ b/pronunciation_practice_tool/static/app.js
@@ -0,0 +1,112 @@
+document.getElementById('upload-form').addEventListener('submit', async function(event) {
+ event.preventDefault();
+ const textFile = document.getElementById('text-file').files[0];
+ const audioFile = document.getElementById('audio-file').files[0];
+ const resultsDiv = document.getElementById('results');
+ resultsDiv.innerHTML = ''; // Clear previous results
+
+ if (!textFile || !audioFile) {
+ alert('Please select both a TXT text file and an MP3 audio file.');
+ return;
+ }
+
+ const formData = new FormData();
+ formData.append('text-file', textFile);
+ formData.append('audio-file', audioFile);
+
+ try {
+ const response = await fetch('/upload', {
+ method: 'POST',
+ body: formData
+ });
+
+ const result = await response.json();
+
+ if (response.ok) {
+ let diffHtml = '';
+ if (result.differences && result.differences.length > 0) {
+ diffHtml = result.differences.map(line => {
+ let style = '';
+ let prefix = line.substring(0, 2);
+ let displayLine = line.substring(2); // Remove prefix for display
+
+ // Handle lines that might be shorter than 2 chars (e.g. empty lines in diff)
+ // This check was slightly different from my previous version.
+ if (line.length < 2) {
+ prefix = '';
+ displayLine = line;
+ }
+
+ if (prefix === '+ ') {
+ style = 'color: green; background-color: #e6ffe6; display: block; white-space: pre-wrap;';
+ } else if (prefix === '- ') {
+ style = 'color: red; background-color: #ffe6e6; display: block; white-space: pre-wrap;';
+ } else if (prefix === '? ') {
+ style = 'color: blue; background-color: #e6e6ff; display: block; white-space: pre-wrap;';
+ } else {
+ // Common lines (no prefix from difflib.Differ like ' ')
+ style = 'display: block; white-space: pre-wrap;';
+ }
+ // Escape HTML to prevent XSS
+ displayLine = displayLine.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
+ // My previous version returned prefix + displayLine, this one returns displayLine only for styled lines, but the prompt's image shows prefix.
+ // The prompt's textual JS example for map returns `${line}` which is line *before* `substring(2)`.
+ // Reconciling: The prompt's JS example for the map function is: return `${line}`;
+ // This implies 'line' *still contains* the prefix.
+ // However, the text description says "line = line.substring(2); // Remove prefix for display"
+ // Let's stick to the JS code block provided in the prompt, which uses the original `line` for the span content.
+ // This means the prefix will be part of the span, and the color will apply to it too.
+ // The prompt text description was: return `${line}`; (where line was substringed)
+ // The prompt's code block was: return `${line}`; (where line was NOT substringed for the span content)
+ // Let's use the version from the prompt's code block, which seems more complete.
+ // The provided code in the prompt is:
+ // line = line.substring(2); // Remove prefix for display
+ // ...
+ // return `${line}`;
+ // This means `displayLine` should be used.
+ return `<span style="${style}">${displayLine}</span>`; // Using displayLine (line without prefix)
+ }).join('<br>'); // Using <br> as per prompt's JS
+ } else if (result.transcribed_text && !result.transcribed_text.startsWith("GCP STT Error:")) {
+ // If there are no differences, it means the texts are identical or one is empty.
+ if (result.text_content === result.transcribed_text) {
+ diffHtml = "
Texts are identical!
"; + } else if (result.transcribed_text) { // Check if transcription was successful + diffHtml = "No significant differences found by difflib. Texts may be very similar or one might be a subset of the other with no conflicting lines.
"; + } else { + diffHtml = "Comparison not performed (e.g., transcription failed or texts were empty).
"; + } + } else { + diffHtml = "Could not generate differences. Transcription might have failed or texts were empty.
" + } + + resultsDiv.innerHTML = `${result.message}
+Text file: ${result.text_file}
+Audio file: ${result.audio_file}
+${result.text_content}
+ ${result.transcribed_text || "Transcription not available."}
+ Error: ${result.error}
`; + console.error('Upload failed:', result); + } + } catch (error) { + resultsDiv.innerHTML = `An unexpected error occurred: ${error.toString()}
`; + console.error('Network or unexpected error:', error); + } +}); diff --git a/pronunciation_practice_tool/static/style.css b/pronunciation_practice_tool/static/style.css new file mode 100644 index 0000000..8859d57 --- /dev/null +++ b/pronunciation_practice_tool/static/style.css @@ -0,0 +1,12 @@ +/* style.css */ +body { font-family: Arial, sans-serif; margin: 0; padding: 20px; background-color: #f4f4f4; color: #333; } +#upload-form div { margin-bottom: 15px; } +#upload-form label { display: block; margin-bottom: 5px; font-weight: bold; } +#upload-form input[type="file"], #upload-form button { padding: 10px; border-radius: 5px; border: 1px solid #ddd; } +#upload-form button { background-color: #007bff; color: white; cursor: pointer; } +#upload-form button:hover { background-color: #0056b3; } +#results { margin-top: 20px; padding: 15px; background-color: #fff; border: 1px solid #ddd; border-radius: 5px; } +#results pre { white-space: pre-wrap; word-wrap: break-word; background-color: #f9f9f9; padding: 10px; border: 1px solid #eee; border-radius: 4px;} +#diff-output span { display: block; padding: 2px 0; } /* Make each diff line take full width */ +.exercise-link-button { display: inline-block; margin-top: 15px; padding: 10px 15px; background-color: #28a745; color: white; text-decoration: none; border-radius: 5px; } +.exercise-link-button:hover { background-color: #218838; } diff --git a/pronunciation_practice_tool/templates/exercise.html b/pronunciation_practice_tool/templates/exercise.html new file mode 100644 index 0000000..ef28867 --- /dev/null +++ b/pronunciation_practice_tool/templates/exercise.html @@ -0,0 +1,39 @@ + + + + + +Here are some words or phrases identified for your practice based on your last reading:
+No specific pronunciation issues were identified from your last session, or you haven't uploaded a file yet. Try uploading a text and audio file!
+ {% endif %} + + Upload New Files +