send initialCanonicalUrl in array format to fix crawler issues (#69370)

The initial RSC payload currently provides the `initialCanonicalUrl` value to initialize the router state during the SSR render. However, as noted in #53274, it seems crawlers like Googlebot will crawl anything URL-like that they find in the document, regardless of whether or not it's actually rendered as a link somewhere (which this isn't). This change splits the `initialCanonicalUrl` value into parts on the server and joins them again when the value is ready to be consumed. That way, when it's embedded in the document, it's not in a path-like format. Fixes #53274. Closes NDX-243.
vercel · Aug 27, 2024 · 7f57d4b · 7f57d4b
1 parent bca90db
commit 7f57d4b
Show file tree

Hide file tree

Showing 5 changed files with 25 additions and 11 deletions.
diff --git a/packages/next/src/client/app-index.tsx b/packages/next/src/client/app-index.tsx
@@ -171,7 +171,7 @@ const pendingActionQueue: Promise<AppRouterActionQueue> = new Promise(
             createInitialRouterState({
               buildId: initialRSCPayload.b,
               initialFlightData: initialRSCPayload.f,
-              initialCanonicalUrl: initialRSCPayload.c,
+              initialCanonicalUrlParts: initialRSCPayload.c,
               initialParallelRoutes: new Map(),
               location: window.location,
               couldBeIntercepted: initialRSCPayload.i,

diff --git a/packages/next/src/client/components/router-reducer/create-initial-router-state.test.tsx b/packages/next/src/client/components/router-reducer/create-initial-router-state.test.tsx
@@ -38,7 +38,7 @@ describe('createInitialRouterState', () => {
       initialFlightData: [
         [initialTree, ['', children, {}, null], <title>Test</title>],
       ],
-      initialCanonicalUrl,
+      initialCanonicalUrlParts: initialCanonicalUrl.split('/'),
       initialParallelRoutes,
       location: new URL('/linking', 'https://localhost') as any,
       couldBeIntercepted: false,
@@ -50,7 +50,7 @@ describe('createInitialRouterState', () => {
       initialFlightData: [
         [initialTree, ['', children, {}, null], <title>Test</title>],
       ],
-      initialCanonicalUrl,
+      initialCanonicalUrlParts: initialCanonicalUrl.split('/'),
       initialParallelRoutes,
       location: new URL('/linking', 'https://localhost') as any,
       couldBeIntercepted: false,

diff --git a/packages/next/src/client/components/router-reducer/create-initial-router-state.ts b/packages/next/src/client/components/router-reducer/create-initial-router-state.ts
@@ -10,7 +10,7 @@ import { addRefreshMarkerToActiveParallelSegments } from './refetch-inactive-par
 
 export interface InitialRouterStateParameters {
   buildId: string
-  initialCanonicalUrl: string
+  initialCanonicalUrlParts: string[]
   initialParallelRoutes: CacheNode['parallelRoutes']
   initialFlightData: FlightDataPath[]
   location: Location | null
@@ -21,12 +21,16 @@ export interface InitialRouterStateParameters {
 export function createInitialRouterState({
   buildId,
   initialFlightData,
-  initialCanonicalUrl,
+  initialCanonicalUrlParts,
   initialParallelRoutes,
   location,
   couldBeIntercepted,
   postponed,
 }: InitialRouterStateParameters) {
+  // When initialized on the server, the canonical URL is provided as an array of parts.
+  // This is to ensure that when the RSC payload is streamed to the client, crawlers don't interpret it
+  // as a URL that should be crawled.
+  const initialCanonicalUrl = initialCanonicalUrlParts.join('/')
   // The initialFlightData is an array of FlightDataPath arrays.
   // For the root render, there'll only be a top-level FlightDataPath array.
   const [initialTree, initialSeedData, initialHead] = initialFlightData[0]

diff --git a/packages/next/src/server/app-render/app-render.tsx b/packages/next/src/server/app-render/app-render.tsx
@@ -482,6 +482,16 @@ async function generateDynamicFlightRenderResult(
   })
 }
 
+/**
+ * Splits the canonical URL (pathname plus search string) on `/` so that it is
+ * not embedded in the RSC payload in a path-like format.
+ *
+ * Crawlers will inadvertently think the canonicalUrl in the RSC payload should be crawled
+ * when our intention is to just seed the router state with the current URL.
+ * The returned parts are meant to be joined back together with `/` later,
+ * when we're ready to consume the path.
+ *
+ * @param url - Value typed as `RequestStore['url']`; only `pathname` and `search` are read.
+ * @returns The string `url.pathname + url.search` split on every `/`.
+ */
+function prepareInitialCanonicalUrl(url: RequestStore['url']) {
+  return (url.pathname + url.search).split('/')
+}
+
 // This is the data necessary to render <AppRouter /> when no SSR errors are encountered
 async function getRSCPayload(
   tree: LoaderTree,
@@ -558,7 +568,7 @@ async function getRSCPayload(
     P: <Preloads preloadCallbacks={preloadCallbacks} />,
     b: ctx.renderOpts.buildId,
     p: ctx.assetPrefix,
-    c: url.pathname + url.search,
+    c: prepareInitialCanonicalUrl(url),
     i: !!couldBeIntercepted,
     f: [[initialTree, seedData, initialHead]],
     m: missingSlots,
@@ -635,7 +645,7 @@ async function getErrorRSCPayload(
   return {
     b: ctx.renderOpts.buildId,
     p: ctx.assetPrefix,
-    c: url.pathname + url.search,
+    c: prepareInitialCanonicalUrl(url),
     m: undefined,
     i: false,
     f: [[initialTree, initialSeedData, initialHead]],
@@ -669,7 +679,7 @@ function App<T>({
   const initialState = createInitialRouterState({
     buildId: response.b,
     initialFlightData: response.f,
-    initialCanonicalUrl: response.c,
+    initialCanonicalUrlParts: response.c,
     // location and initialParallelRoutes are not initialized in the SSR render
     // they are set to an empty map and window.location, respectively during hydration
     initialParallelRoutes: null!,
@@ -727,7 +737,7 @@ function AppWithoutContext<T>({
   const initialState = createInitialRouterState({
     buildId: response.b,
     initialFlightData: response.f,
-    initialCanonicalUrl: response.c,
+    initialCanonicalUrlParts: response.c,
     // location and initialParallelRoutes are not initialized in the SSR render
     // they are set to an empty map and window.location, respectively during hydration
     initialParallelRoutes: null!,

diff --git a/packages/next/src/server/app-render/types.ts b/packages/next/src/server/app-render/types.ts
@@ -206,8 +206,8 @@ export type InitialRSCPayload = {
   b: string
   /** assetPrefix */
   p: string
-  /** initialCanonicalUrl */
-  c: string
+  /** initialCanonicalUrlParts */
+  c: string[]
   /** couldBeIntercepted */
   i: boolean
   /** initialFlightData */