% meeting.tex -- internal team meeting agenda and pre-advisor worksheet
\documentclass[12pt]{article}
\usepackage{siunitx}

\begin{document}

\section*{Internal Team Meeting Agenda: Metrics \& Project Alignment}

\subsection*{Meeting Objectives}
\begin{itemize}
    \item Align the team on \textbf{what exactly we are testing} in the project
    \item Define \textbf{clear, measurable metrics} (KPIs) for both software and hardware components
    \item Establish \textbf{minimum passing criteria and failure conditions}
    \item Identify \textbf{existing benchmarks and prior work} to ground our evaluation
    \item Agree on a concrete \textbf{short-term action plan} for the next week
\end{itemize}

\subsection*{1. Project Goal Re-alignment}
\begin{itemize}
    \item Restate the core goal of the second semester project
    \item Clarify a measurable goal for the overall system:
    \begin{itemize}
      \item Objective from first semester:
      To demonstrate successful end-to-end handover, the system must integrate language understanding, symbolic planning, perception, and grasping into a reliable pipeline. Recent studies show that robot-to-human tool handover can reach around 92.5\% success in simulation for construction tools~\cite{iaarc2025_handover}, while sim-to-real grasping frameworks report 90--97\% success depending on object familiarity~\cite{mogpe2022,grasping2023}. Performance is typically lower for novel or cluttered settings, highlighting the importance of generalization.

      Timing benchmarks suggest that simple object handovers can be completed in 8--10 seconds~\cite{handover2024_fast}, though more complex engineering tools may reasonably require up to 15 seconds. Based on these findings, this project defines its main performance goal as achieving \textbf{$\geq90\%$ success over 60--80 trials}, with \textbf{completion within 15 seconds} and \textbf{minimal disturbance} ($\leq\SI{2}{\centi\meter}$) to non-target tools.
    \end{itemize}

\end{itemize}

\subsection*{2. Define the Final System-Level Test}
\begin{itemize}
    \item Describe the \textbf{final end-to-end task} the system must perform
    \item Identify real-world inspired task scenarios to benchmark against
    \item Discuss:
    \begin{itemize}
        \item What success looks like
        \item What constitutes a clear failure
    \end{itemize}
    \item Decide on observable outputs for evaluation
\end{itemize}

\subsection*{3. Metrics and KPIs Definition}
\textbf{Discussion focus: be specific, measurable, and benchmarked}
\begin{itemize}
    \item Performance metrics:
    \begin{itemize}
        \item Task completion time
        \item Planning latency
        \item Execution success rate
    \end{itemize}
    \item Accuracy metrics:
    \begin{itemize}
        \item Vision perception accuracy
        \item Correctness of generated PDDL
    \end{itemize}
    \item Robustness metrics:
    \begin{itemize}
        \item Failure recovery rate
        \item Sensitivity to perception errors
    \end{itemize}
    \item Define:
    \begin{itemize}
        \item Minimum passing criteria
        \item Failing conditions
    \end{itemize}
\end{itemize}

\subsection*{4. Component-Level Evaluation Plan}
\begin{itemize}
    \item Vision module evaluation
    \item Ontology-based PDDL generation evaluation
    \item Planner integration with vision outputs
    \item Decide what to test independently vs. end-to-end
\end{itemize}

\subsection*{5. Benchmarks and Prior Work}
\begin{itemize}
    \item Identify existing benchmarks or datasets to reuse
    \item Review prior work on:
    \begin{itemize}
        \item Automatic PDDL generation
        \item Ontology-driven task planning
        \item Vision-planning integration
    \end{itemize}
    \item Decide which benchmarks are realistic and defensible
\end{itemize}

\subsection*{6. One-Week Brainstorming Phase (No Development)}
\begin{itemize}
    \item Agree on a one-week period focused on:
    \begin{itemize}
        \item Metric refinement
        \item Literature review
        \item Benchmark selection
    \end{itemize}
    \item Define expected outputs at the end of the week
\end{itemize}

\subsection*{7. Action Items and Ownership}
\begin{itemize}
    \item Assign responsibilities for:
    \begin{itemize}
        \item Metric definition
        \item Benchmark research
        \item Ontology and PDDL review
        \item Vision-planning integration analysis
    \end{itemize}
    \item Set internal deadlines and next meeting date
\end{itemize}

\subsection*{8. Wrap-up}
\begin{itemize}
    \item Confirm shared understanding of goals and metrics
    \item Identify open questions or risks
    \item Align on next steps
\end{itemize}

\newpage
\section{Pre-Advisor Meeting Worksheet: Measurable Objectives and Ownership}

\subsection*{Instructions}
Each subsection below must be completed before the advisor meeting.
All metrics must be:
\begin{itemize}
    \item Quantitative
    \item Measurable with a defined method
    \item Comparable to prior work or benchmarks
\end{itemize}

\subsection{Image Processing (Perception)}

\subsubsection{Objectives}
\begin{itemize}
    \item What perception capabilities are required for the final system task?\\
    \rule{\linewidth}{0.4pt}
\end{itemize}

\subsubsection{Metrics and Measurement}
\begin{center}
\begin{tabular}{|p{4cm}|p{3cm}|p{3cm}|p{4cm}|}
\hline
\textbf{Metric} & \textbf{Unit} & \textbf{Target} & \textbf{Measurement Method} \\
\hline
Object detection accuracy &  &  &  \\
\hline
Pose estimation error &  &  &  \\
\hline
Perception latency &  &  &  \\
\hline
\end{tabular}
\end{center}

\subsubsection{Passing and Failure Criteria}
\begin{itemize}
    \item Minimum passing criteria:\\
    \rule{0.9\linewidth}{0.4pt}
    \item Failure conditions:\\
    \rule{0.9\linewidth}{0.4pt}
\end{itemize}

\subsubsection{Benchmarks / References}
\begin{itemize}
    \item Prior work or dataset used for comparison:\\
    \rule{0.9\linewidth}{0.4pt}
\end{itemize}

%----------------------------------------------------

\subsection{Speech Processing}

\subsubsection{Objectives}
\begin{itemize}
    \item What role does speech play in the final system behavior?\\
    \rule{\linewidth}{0.4pt}
\end{itemize}

\subsubsection{Metrics and Measurement}
\begin{center}
\begin{tabular}{|p{4cm}|p{3cm}|p{3cm}|p{4cm}|}
\hline
\textbf{Metric} & \textbf{Unit} & \textbf{Target} & \textbf{Measurement Method} \\
\hline
Command recognition accuracy &  &  &  \\
\hline
End-to-end response latency &  &  &  \\
\hline
Robustness to noise &  &  &  \\
\hline
\end{tabular}
\end{center}

\subsubsection{Passing and Failure Criteria}
\begin{itemize}
    \item Minimum passing criteria:\\
    \rule{0.9\linewidth}{0.4pt}
    \item Failure conditions:\\
    \rule{0.9\linewidth}{0.4pt}
\end{itemize}

\subsubsection{Benchmarks / References}
\begin{itemize}
    \item Dataset, baseline system, or literature benchmark:\\
    \rule{0.9\linewidth}{0.4pt}
\end{itemize}

%----------------------------------------------------

\subsection{Planning Algorithm (Ontology \& PDDL)}

\subsubsection{Objectives}
\begin{itemize}
\item Demonstrate that natural language instructions combined with visual perception can be reliably translated into \textbf{syntactically correct and solvable PDDL problem instances}, enabling classical planners to generate executable task plans. This objective follows prior work showing that vision--language-to-PDDL pipelines can achieve near-perfect planning feasibility across multiple domains when ontology grounding and verification loops are used \cite{planowl}.
\end{itemize}

\subsubsection{Metrics and Measurement}
\begin{center}
\begin{tabular}{|p{3.8cm}|p{2.2cm}|p{3.2cm}|p{5cm}|}
\hline
\textbf{Metric} & \textbf{Unit} & \textbf{Target} & \textbf{Measurement Method} \\
\hline

Planning success rate ($F_{plan}$) &
\% &
97\%&
Percentage of tasks for which a valid plan is found that reaches the goal without constraint violations. Uses the binary feasibility metric $F_{plan}$ (1 = success, 0 = failure) as defined in PlanOwl~\cite{planowl}. \\
\hline

PDDL syntax correctness ($F_{syn}$) &
\% of tasks (binary 0/1 per task) &
98\% &
Domain and problem files must be fully parsable by a PDDL planner. Measured using the syntax correctness metric $F_{syn}$, requiring both files to be valid, as used in PlanOwl~\cite{planowl}. \\
\hline

Semantic grounding accuracy ($F_{pd1}$) &
F-score &
95\% &
Balanced F-score comparing generated PDDL objects, initial states, and goal predicates against ground-truth problem instances. Precision and recall are computed over symbolic elements, and the final score is their harmonic mean, following the definition in PlanOwl \cite{planowl}. \\
\hline

\end{tabular}
\end{center}

The balanced F-score $F_{pd1}$ is defined as the harmonic mean of precision and recall over symbolic planning elements (objects, initial predicates, and goal predicates):
\[
F_{pd1} = \frac{2 \cdot P \cdot R}{P + R}
\]
where precision $P$ measures the fraction of generated symbols that are correct, and recall $R$ measures the fraction of ground-truth symbols correctly generated. Prior work reports $F_{pd1} = 1.0$ when perception and grounding are correct, with deviations attributed to upstream perception errors \cite{planowl}.

\subsubsection{Passing and Failure Criteria}
\begin{itemize}
\item Minimum passing criteria:\\
A task is considered passing if the generated PDDL domain and problem files are syntactically valid and a classical planner can generate a plan that reaches the intended goal state without unintended actions, consistent with prior PDDL-based validation practices~\cite{planowl,mbse}.
\item Failure conditions:\\
Failures include (i) invalid or non-parsable PDDL files, (ii) no plan found within established planner time limits, or (iii) semantic inconsistencies between perceived objects and ontology constraints (e.g., infeasible grasp or tool assignments), as reported in prior ontology-based and demonstration-driven planning systems~\cite{diehl,ontology}.
\end{itemize}

\subsubsection{Benchmarks / References}
\begin{itemize}
\item Vision--language to PDDL planning benchmarks in Block-world, Object Arrangement, and Hanoi domains \cite{planowl}.
\item These benchmarks may need slight modifications so that they make sense with the current depth camera position.
\item Results reported by PlanOwl~\cite{planowl}:
\begin{itemize}
    \item \textbf{Block-World}: The system achieved perfect scores (100\%) for F-score, Plan Correctness, and Syntax Correctness
    \item \textbf{Object Arrangement (Standard)}: For the standard, Large Number of Objects (LO), and Predefined Relationships (R) variants, PlanOwl achieved perfect scores (100\%) across all three metrics
    \item \textbf{Object Arrangement (Multiple Objects)}: The system achieved 98.8\% for F-score and Plan Correctness, and 98.4\% for Syntax Correctness
    \item \textbf{Hanoi}: The system achieved an F-score of 95.6\%, Plan Correctness of 97.4\%, and Syntax Correctness of 98.4\%
\end{itemize}
\end{itemize}

%----------------------------------------------------

\subsection{Detailed Plan for Improving PDDL Generation}

\subsubsection{Related Work}
\begin{itemize}
    \item Most approaches use an LLM/VLM to generate PDDL
    \item Ontologies or knowledge graphs are used to augment the generation process, which is still performed by the LLM/VLM
    \item These ontologies or knowledge graphs are created manually
    \item Most approaches that use an LLM/VLM to generate PDDL rely on techniques such as few-shot learning, prompt engineering, iterative refinement, and feedback loops to improve generation quality
\end{itemize}

\subsubsection{Plan for Improvement}
\begin{itemize}
    \item Test the current approach with visual capabilities enabled, since the previous test was run without them
    \item The current limitation stems from the difficulty of modeling sequential processes in PDDL
    \item The point of using PDDL is to compensate for LLMs' difficulty with long-term and complex task planning. The key requirement is that the generated PDDL must be correct and useful for planning.
\end{itemize}

%----------------------------------------------------

\subsection{End-to-End System Evaluation}

\begin{itemize}
    \item Number of trials: \rule{4cm}{0.4pt}
    \item Overall success rate target: \rule{4cm}{0.4pt}
    \item Maximum allowed task duration: \rule{4cm}{0.4pt}
    \item Failure modes observed:\\
    \rule{0.9\linewidth}{0.4pt}
\end{itemize}

%----------------------------------------------------

\subsection{Responsibility and Ownership}

\textbf{Each section must have a clearly responsible lead.}

\begin{center}
\begin{tabular}{|p{5cm}|p{4cm}|p{5cm}|}
\hline
\textbf{Worksheet Section} & \textbf{Primary Owner} & \textbf{Supporting Members} \\
\hline
Image Processing & MB &  \\
\hline
Speech Processing & Pory &  \\
\hline
Planning Algorithm & Kong &  \\
\hline
End-to-End Evaluation & Pory & Kong  \\
\hline
\end{tabular}
\end{center}

\subsection{Questions for Advisor}
\begin{itemize}
    \item \rule{0.95\linewidth}{0.4pt}
    \item \rule{0.95\linewidth}{0.4pt}
\end{itemize}

\newpage
\bibliographystyle{IEEEtran}
\bibliography{ref}

\end{document}