chore: document mcp tools (#35258)

2025-03-18 16:59:56 -07:00 · 2025-03-18 16:59:56 -07:00 · 23b5b05f67
parent 0a3387fda3
commit 23b5b05f67
3 changed files with 213 additions and 17 deletions
--- a/packages/playwright-mcp/README.md
+++ b/packages/playwright-mcp/README.md
@ -1,4 +1,4 @@
-### Playwright MCP
+## Playwright MCP

 This package is experimental and not yet ready for production use.
 It is subject to change and will not respect semver versioning.
@ -62,3 +62,162 @@ And then in MCP config, add following to the `env`:
  }
 }
 ```
+
+### Tool Modes
+
+The tools are available in two modes:
+
+1. **Snapshot Mode** (default): Uses accessibility snapshots for better performance and reliability
+2. **Vision Mode**: Uses screenshots for visual-based interactions
+
+To use Vision Mode, add the `--vision` flag when starting the server:
+
+```js
+{
+  "mcpServers": {
+    "playwright": {
+      "command": "npx",
+      "args": [
+        "@playwright/mcp",
+        "--vision"
+      ]
+    }
+  }
+}
+```
+
+Vision Mode works best with computer-use models that can interact with elements using
+X/Y coordinates derived from the provided screenshot.
+
+### Snapshot Mode
+
+The Playwright MCP provides a set of tools for browser automation. Here are all available tools:
+
+- **browser_navigate**
+  - Description: Navigate to a URL
+  - Parameters:
+    - `url` (string): The URL to navigate to
+
+- **browser_go_back**
+  - Description: Go back to the previous page
+  - Parameters: None
+
+- **browser_go_forward**
+  - Description: Go forward to the next page
+  - Parameters: None
+
+- **browser_click**
+  - Description: Perform click on a web page
+  - Parameters:
+    - `element` (string): Human-readable element description used to obtain the permission to interact with the element
+    - `ref` (string): Exact target element reference from the page snapshot
+
+- **browser_hover**
+  - Description: Hover over element on page
+  - Parameters:
+    - `element` (string): Human-readable element description used to obtain the permission to interact with the element
+    - `ref` (string): Exact target element reference from the page snapshot
+
+- **browser_drag**
+  - Description: Perform drag and drop between two elements
+  - Parameters:
+    - `startElement` (string): Human-readable source element description used to obtain the permission to interact with the element
+    - `startRef` (string): Exact source element reference from the page snapshot
+    - `endElement` (string): Human-readable target element description used to obtain the permission to interact with the element
+    - `endRef` (string): Exact target element reference from the page snapshot
+
+- **browser_type**
+  - Description: Type text into editable element
+  - Parameters:
+    - `element` (string): Human-readable element description used to obtain the permission to interact with the element
+    - `ref` (string): Exact target element reference from the page snapshot
+    - `text` (string): Text to type into the element
+    - `submit` (boolean): Whether to submit entered text (press Enter after)
+
+- **browser_press_key**
+  - Description: Press a key on the keyboard
+  - Parameters:
+    - `key` (string): Name of the key to press or a character to generate, such as `ArrowLeft` or `a`
+
+- **browser_snapshot**
+  - Description: Capture accessibility snapshot of the current page (better than screenshot)
+  - Parameters: None
+
+- **browser_save_as_pdf**
+  - Description: Save page as PDF
+  - Parameters: None
+
+- **browser_wait**
+  - Description: Wait for a specified time in seconds
+  - Parameters:
+    - `time` (number): The time to wait in seconds (capped at 10 seconds)
+
+- **browser_close**
+  - Description: Close the page
+  - Parameters: None
+
+
+### Vision Mode
+
+Vision Mode provides tools for visual-based interactions using screenshots. Here are all available tools:
+
+- **browser_navigate**
+  - Description: Navigate to a URL
+  - Parameters:
+    - `url` (string): The URL to navigate to
+
+- **browser_go_back**
+  - Description: Go back to the previous page
+  - Parameters: None
+
+- **browser_go_forward**
+  - Description: Go forward to the next page
+  - Parameters: None
+
+- **browser_screenshot**
+  - Description: Capture screenshot of the current page
+  - Parameters: None
+
+- **browser_move_mouse**
+  - Description: Move mouse to specified coordinates
+  - Parameters:
+    - `x` (number): X coordinate
+    - `y` (number): Y coordinate
+
+- **browser_click**
+  - Description: Click at specified coordinates
+  - Parameters:
+    - `x` (number): X coordinate to click at
+    - `y` (number): Y coordinate to click at
+
+- **browser_drag**
+  - Description: Perform drag and drop operation
+  - Parameters:
+    - `startX` (number): Start X coordinate
+    - `startY` (number): Start Y coordinate
+    - `endX` (number): End X coordinate
+    - `endY` (number): End Y coordinate
+
+- **browser_type**
+  - Description: Type text using the keyboard (takes no coordinates; the text goes to the currently focused element)
+  - Parameters:
+    - `text` (string): Text to type
+    - `submit` (boolean): Whether to submit entered text (press Enter after)
+
+- **browser_press_key**
+  - Description: Press a key on the keyboard
+  - Parameters:
+    - `key` (string): Name of the key to press or a character to generate, such as `ArrowLeft` or `a`
+
+- **browser_save_as_pdf**
+  - Description: Save page as PDF
+  - Parameters: None
+
+- **browser_wait**
+  - Description: Wait for a specified time in seconds
+  - Parameters:
+    - `time` (number): The time to wait in seconds (capped at 10 seconds)
+
+- **browser_close**
+  - Description: Close the page
+  - Parameters: None
--- a/packages/playwright-mcp/src/tools/screenshot.ts
+++ b/packages/playwright-mcp/src/tools/screenshot.ts
@ -38,7 +38,7 @@ export const screenshot: Tool = {
 };

 const elementSchema = z.object({
-  element: z.string().describe('Element label, description or any other text to describe the element'),
+  element: z.string().describe('Human-readable element description used to obtain the permission to interact with the element'),
 });

 const moveMouseSchema = elementSchema.extend({
@ -63,15 +63,22 @@ export const moveMouse: Tool = {
  },
 };

+const clickSchema = elementSchema.extend({
+  x: z.number().describe('X coordinate'),
+  y: z.number().describe('Y coordinate'),
+});
+
 export const click: Tool = {
  schema: {
    name: 'browser_click',
    description: 'Click left mouse button',
-    inputSchema: zodToJsonSchema(elementSchema),
+    inputSchema: zodToJsonSchema(clickSchema),
  },

-  handle: async context => {
+  handle: async (context, params) => {
    await runAndWait(context, async page => {
+      const validatedParams = clickSchema.parse(params);
+      await page.mouse.move(validatedParams.x, validatedParams.y);
      await page.mouse.down();
      await page.mouse.up();
    });
@ -82,8 +89,10 @@ export const click: Tool = {
 };

 const dragSchema = elementSchema.extend({
-  x: z.number().describe('X coordinate'),
-  y: z.number().describe('Y coordinate'),
+  startX: z.number().describe('Start X coordinate'),
+  startY: z.number().describe('Start Y coordinate'),
+  endX: z.number().describe('End X coordinate'),
+  endY: z.number().describe('End Y coordinate'),
 });

 export const drag: Tool = {
@ -96,18 +105,20 @@ export const drag: Tool = {
  handle: async (context, params) => {
    const validatedParams = dragSchema.parse(params);
    await runAndWait(context, async page => {
+      await page.mouse.move(validatedParams.startX, validatedParams.startY);
      await page.mouse.down();
-      await page.mouse.move(validatedParams.x, validatedParams.y);
+      await page.mouse.move(validatedParams.endX, validatedParams.endY);
      await page.mouse.up();
    });
    return {
-      content: [{ type: 'text', text: `Dragged mouse to (${validatedParams.x}, ${validatedParams.y})` }],
+      content: [{ type: 'text', text: `Dragged mouse from (${validatedParams.startX}, ${validatedParams.startY}) to (${validatedParams.endX}, ${validatedParams.endY})` }],
    };
  },
 };

 const typeSchema = z.object({
-  text: z.string().describe('Text to type'),
+  text: z.string().describe('Text to type into the element'),
+  submit: z.boolean().describe('Whether to submit entered text (press Enter after)'),
 });

 export const type: Tool = {
@ -121,7 +132,9 @@ export const type: Tool = {
    const validatedParams = typeSchema.parse(params);
    await runAndWait(context, async page => {
      await page.keyboard.type(validatedParams.text);
-    });
+      if (validatedParams.submit)
+        await page.keyboard.press('Enter');
+    }, true);
    return {
      content: [{ type: 'text', text: `Typed text "${validatedParams.text}"` }],
    };
--- a/packages/playwright-mcp/src/tools/snapshot.ts
+++ b/packages/playwright-mcp/src/tools/snapshot.ts
@ -35,8 +35,8 @@ export const snapshot: Tool = {
 };

 const elementSchema = z.object({
-  element: z.string().describe('Element label, description of any other text to describe the element'),
-  ref: z.string().describe('Target element reference'),
+  element: z.string().describe('Human-readable element description used to obtain the permission to interact with the element'),
+  ref: z.string().describe('Exact target element reference from the page snapshot'),
 });

 export const click: Tool = {
@ -48,7 +48,31 @@ export const click: Tool = {

  handle: async (context, params) => {
    const validatedParams = elementSchema.parse(params);
-    return runAndWait(context, page => refLocator(page, validatedParams).click(), true);
+    return runAndWait(context, page => refLocator(page, validatedParams.ref).click(), true);
+  },
+};
+
+const dragSchema = z.object({
+  startElement: z.string().describe('Human-readable source element description used to obtain the permission to interact with the element'),
+  startRef: z.string().describe('Exact source element reference from the page snapshot'),
+  endElement: z.string().describe('Human-readable target element description used to obtain the permission to interact with the element'),
+  endRef: z.string().describe('Exact target element reference from the page snapshot'),
+});
+
+export const drag: Tool = {
+  schema: {
+    name: 'browser_drag',
+    description: 'Perform drag and drop between two elements',
+    inputSchema: zodToJsonSchema(dragSchema),
+  },
+
+  handle: async (context, params) => {
+    const validatedParams = dragSchema.parse(params);
+    return runAndWait(context, async page => {
+      const startLocator = refLocator(page, validatedParams.startRef);
+      const endLocator = refLocator(page, validatedParams.endRef);
+      await startLocator.dragTo(endLocator);
+    }, true);
  },
 };

@ -61,7 +85,7 @@ export const hover: Tool = {

  handle: async (context, params) => {
    const validatedParams = elementSchema.parse(params);
-    return runAndWait(context, page => refLocator(page, validatedParams).hover(), true);
+    return runAndWait(context, page => refLocator(page, validatedParams.ref).hover(), true);
  },
 };

@ -80,7 +104,7 @@ export const type: Tool = {
  handle: async (context, params) => {
    const validatedParams = typeSchema.parse(params);
    return await runAndWait(context, async page => {
-      const locator = refLocator(page, validatedParams);
+      const locator = refLocator(page, validatedParams.ref);
      await locator.fill(validatedParams.text);
      if (validatedParams.submit)
        await locator.press('Enter');
@ -88,6 +112,6 @@ export const type: Tool = {
  },
 };

-function refLocator(page: playwright.Page, params: z.infer<typeof elementSchema>): playwright.Locator {
-  return page.locator(`aria-ref=${params.ref}`);
+function refLocator(page: playwright.Page, ref: string): playwright.Locator {
+  return page.locator(`aria-ref=${ref}`);
 }